Revision c2159551 share/hooks/host_error.rb

View differences:

share/hooks/host_error.rb
18 18

  
19 19
##############################################################################
20 20
# Script to implement host failure tolerance
21
#   It can be set to
22
#           -m migrate VMs to another host. Only for images in shared storage
21
#   One of the following modes must be chosen
22
#           -m resched VMs to another host. (Only for images in shared storage!)
23 23
#           -r recreate VMs running in the host. State will be lost.
24 24
#           -d delete VMs running in the host
25
#
25 26
#   Additional flags
26
#           -f force resubmission of suspended VMs
27
#           -p <n> avoid resubmission if host comes
28
#                  back after n monitoring cycles
27
#           -f resubmit suspended and powered off VMs (only for recreate)
28
#           -p <n> wait n monitoring cycles and skip resubmission if the host
29
#                 has come back by then. 0 to disable the wait. Default is 2.
30
#           -u disables fencing. Fencing is enabled by default. Don't disable it
31
#                 unless you are very sure about what you're doing
29 32
##############################################################################
30 33

  
31
##############################################################################
32
# WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!
33
#
34
# This script needs to fence the error host to prevent split brain VMs. You
35
# may use any fence mechanism and invoke it around L105, using host_name
36
#
37
# WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!
38
#############################################################################
39

  
40 34
ONE_LOCATION=ENV["ONE_LOCATION"]
41 35

  
42 36
if !ONE_LOCATION
43 37
    RUBY_LIB_LOCATION="/usr/lib/one/ruby"
44 38
    VMDIR="/var/lib/one"
45 39
    CONFIG_FILE="/var/lib/one/config"
40
    LOG_FILE="/var/log/one/host_error.log"
46 41
else
47 42
    RUBY_LIB_LOCATION=ONE_LOCATION+"/lib/ruby"
48 43
    VMDIR=ONE_LOCATION+"/var"
49 44
    CONFIG_FILE=ONE_LOCATION+"/var/config"
45
    LOG_FILE=ONE_LOCATION+"/var/host_error.log"
50 46
end
51 47

  
48
FENCE_HOST = File.dirname(__FILE__) + '/fence_host.sh'
49

  
52 50
$: << RUBY_LIB_LOCATION
53 51

  
54 52
require 'opennebula'
55 53
include OpenNebula
56 54

  
57 55
require 'getoptlong'
56
require 'base64'
57
require 'open3'
58

  
59
################################################################################
60
# Arguments
61
################################################################################
58 62

  
59
if !(host_id=ARGV[0])
63
HOST_ID = ARGV[0]
64

  
65
if HOST_ID.nil?
60 66
    exit -1
61 67
end
62 68

  
63
mode   = "-r" # By default, recreate VMs
64
force  = "n"  # By default, don't recreate/delete suspended VMs
65
repeat = nil  # By default, don't wait for monitorization cycles"
69
################################################################################
70
# Methods
71
################################################################################
72

  
73
# Appends a message to the hook log file (LOG_FILE).
#
# Every line of +msg+ becomes its own log entry, prefixed with the current
# time, the id of the failing host (HOST_ID) and the severity tag.
#
# msg   - text to log; converted with to_s, so exceptions and other objects
#         are accepted. May span several lines.
# level - severity tag written in the prefix. Defaults to "I".
def log(msg, level="I")
    File.open(LOG_FILE, 'a') do |f|
        # each_line replaces the deprecated block form of String#lines.
        msg.to_s.each_line do |l|
            f.puts "[#{Time.now}][HOST #{HOST_ID}][#{level}] #{l}"
        end
    end
end
80

  
81
# Writes +msg+ to the hook log with the error severity tag "E".
def log_error(msg)
    severity = "E"
    log(msg, severity)
end
84

  
85
# Logs a final error entry and terminates the hook with exit status -1.
#
# msg - optional description of the failure; when given it is logged as an
#       error before the generic exit notice. The parameter is optional so
#       that both call styles used in this script work: `exit_error` and
#       `exit_error "reason"` (the latter used to raise ArgumentError).
def exit_error(msg = nil)
    log_error(msg) unless msg.nil?
    log_error("Exiting due to previous error.")
    exit(-1)
end
89

  
90
# Builds an XPath predicate that matches any of the given VM state codes.
#
# arr - state codes, e.g. states_xpath(3, 5) => "STATE=3 or STATE=5"
#
# Returns the predicates joined with " or "; an empty string when no codes
# are given.
def states_xpath(*arr)
    predicates = arr.collect do |state_id|
        "STATE=" + state_id.to_s
    end

    predicates.join(" or ")
end
93

  
94
################################################################################
95
# Options
96
################################################################################
97

  
98
mode    = nil    # **must** be set to something other than nil using the options
99
force   = false  # By default, don't recreate/delete suspended and poweroff VMs
100
repeat  = 2      # By default, wait for 2 monitorization cycles
101
fencing = true
66 102

  
67 103
opts = GetoptLong.new(
68
            ['--migrate',  '-m',GetoptLong::NO_ARGUMENT],
69
            ['--delete',   '-d',GetoptLong::NO_ARGUMENT],
70
            ['--recreate', '-r',GetoptLong::NO_ARGUMENT],
71
            ['--force',    '-f',GetoptLong::NO_ARGUMENT],
72
            ['--pause',    '-p',GetoptLong::REQUIRED_ARGUMENT]
104
            ['--migrate',     '-m', GetoptLong::NO_ARGUMENT],
105
            ['--delete',      '-d', GetoptLong::NO_ARGUMENT],
106
            ['--recreate',    '-r', GetoptLong::NO_ARGUMENT],
107
            ['--force',       '-f', GetoptLong::NO_ARGUMENT],
108
            ['--pause',       '-p', GetoptLong::REQUIRED_ARGUMENT],
109
            ['--no-fencing',  '-u', GetoptLong::NO_ARGUMENT]
73 110
        )
74 111

  
75 112
begin
76 113
    opts.each do |opt, arg|
77 114
        case opt
78 115
            when '--migrate'
79
                mode="-m"
116
                mode = :migrate
80 117
            when '--delete'
81
                mode="-d"
118
                mode = :delete
82 119
            when '--recreate'
83
                mode="-r"
120
                mode = :recreate
84 121
            when '--force'
85
                force  = "y"
122
                force = true
86 123
            when '--pause'
87 124
                repeat = arg.to_i
125
            when '--no-fencing'
126
                fencing = false
88 127
        end
89 128
    end
90 129
rescue Exception => e
91
    exit(-1)
130
    log_error e.to_s
131
    exit_error
132
end
133

  
134
if mode.nil?
135
    log_error "Exiting. A mode must be supplied."
136
    exit_error
92 137
end
93 138

  
139
################################################################################
140
# Main
141
################################################################################
142

  
143
log "Hook launched"
144

  
94 145
begin
95 146
    client = Client.new()
96 147
rescue Exception => e
97
    puts "Error: #{e}"
98
    exit -1
148
    log_error e.to_s
149
    exit_error
150
end
151

  
152
sys  = OpenNebula::System.new(client)
153
conf = sys.get_configuration
154

  
155
begin
156
    MONITORING_INTERVAL = conf['MONITORING_INTERVAL'] || 60
157
rescue Exception => e
158
    log_error "Could not get MONITORING_INTERVAL"
159
    log_error e.to_s
160
    exit_error
99 161
end
100 162

  
101 163
# Retrieve hostname
102
host  =  OpenNebula::Host.new_with_id(host_id, client)
103
rc = host.info
104
exit -1 if OpenNebula.is_error?(rc)
105
host_name = host.name
106

  
107
if repeat
108
    # Retrieve host monitor interval
109
    monitor_interval = nil
110
    File.readlines(CONFIG_FILE).each{|line|
111
         monitor_interval = line.split("=").last.to_i if /MONITORING_INTERVAL/=~line
112
    }
164
host = OpenNebula::Host.new_with_id(HOST_ID, client)
165
rc   = host.info
166

  
167
if OpenNebula.is_error?(rc)
168
    log_error "Could not get host info"
169
    exit_error
170
end
171

  
172
log "hostname: #{host.name}"
173

  
174
if repeat > 0
175
    log "Wait #{repeat} cycles."
176

  
113 177
    # Sleep through the desired number of monitoring intervals
114
    sleep (repeat * monitor_interval)
178
    period = repeat * MONITORING_INTERVAL.to_i
179

  
180
    log "Sleeping #{period} seconds."
181
    sleep(period)
182

  
183
    rc = host.info
184
    if OpenNebula.is_error?(rc)
185
        log_error "Could not get host info"
186
        exit_error
187
    end
115 188

  
116 189
    # If the host came back, exit to avoid duplicated VMs
117
    exit 0 if host.state != 3
190
    if host.state != 3
191
        log "Exiting. Host came back after waiting."
192
        exit 0
193
    end
194
end
195

  
196
# Do fencing
197
if fencing
198
    host64 = Base64::strict_encode64(host.to_xml)
199

  
200
    log "Fencing enabled"
201

  
202
    begin
203
        i, oe, w = Open3.popen2e(FENCE_HOST, host64)
204
        if w.value.success?
205
            log oe.read
206
            log "Fencing success"
207
        else
208
            raise oe.read << "\n" << "Fencing error"
209
        end
210
    rescue Exception => e
211
        log_error e.to_s
212
        exit_error
213
    end
214
else
215
    log "WARNING: Fencing disabled"
118 216
end
119 217

  
120 218
# Loop through all vms
121 219
vms = VirtualMachinePool.new(client)
122
rc = vms.info_all
123
exit -1 if OpenNebula.is_error?(rc)
220
rc  = vms.info_all
221

  
222
if OpenNebula.is_error?(rc)
223
    exit_error "Could not get vm pool"
224
end
124 225

  
226
# STATE=3: ACTIVE (LCM unknown)
227
# STATE=5: SUSPENDED
228
# STATE=8: POWEROFF
125 229

  
126
state = "STATE=3"
127
state += " or STATE=5 or STATE=8" if force == "y"
230
if mode == :recreate && !force
231
    log "states: 3"
232
    state = states_xpath(3)
233
else
234
    log "states: 3, 5, 8"
235
    state = states_xpath(3, 5, 8)
236
end
128 237

  
129
vm_ids_array = vms.retrieve_elements("/VM_POOL/VM[#{state}]/HISTORY_RECORDS/HISTORY[HOSTNAME=\"#{host_name}\" and last()]/../../ID")
238
xpath = "/VM_POOL/VM[#{state}]/HISTORY_RECORDS/HISTORY[HOSTNAME=\"#{host.name}\" and last()]"
239
vm_ids_array = vms.retrieve_elements("#{xpath}/../../ID")
130 240

  
131 241
if vm_ids_array
242
    log "vms: #{vm_ids_array}"
243

  
132 244
    vm_ids_array.each do |vm_id|
133
        vm=OpenNebula::VirtualMachine.new_with_id(vm_id, client)
134
        vm.info
245
        vm = OpenNebula::VirtualMachine.new_with_id(vm_id, client)
246
        rc = vm.info
135 247

  
136
        if mode == "-r"
248
        if OpenNebula.is_error?(rc)
249
            log_error "Could not get info of VM #{vm_id}"
250
            next
251
        end
252

  
253
        case mode
254
        when :recreate
255
            log "recreate #{vm_id}"
137 256
            vm.delete(true)
138
        elsif mode == "-d"
257
        when :delete
258
            log "delete #{vm_id}"
139 259
            vm.delete
140
        elsif mode == "-m"
260
        when :migrate
261
            log "resched #{vm_id}"
141 262
            vm.resched
263
        else
264
            log_error "unkown mode '#{mode}'"
265
            exit_error
142 266
        end
143 267
    end
268
else
269
    log "No VMs found."
144 270
end
145 271

  
272
log "Hook finished"
273
exit 0

Also available in: Unified diff