Revision c2159551

View differences:

install.sh
1224 1224
# HOOK scripts, to be installed under $VAR_LOCATION/remotes/hooks
1225 1225
#-------------------------------------------------------------------------------
1226 1226

  
1227
HOOK_FT_FILES="share/hooks/host_error.rb"
1227
HOOK_FT_FILES="share/hooks/host_error.rb \
1228
               share/hooks/fence_host.sh"
1228 1229

  
1229 1230
#-------------------------------------------------------------------------------
1230 1231
# Installation scripts, to be installed under $SHARE_LOCATION
share/etc/oned.conf
696 696
#*******************************************************************************
697 697
# This hook is used to perform recovery actions when a host fails.
698 698
# Script to implement host failure tolerance
699
#   It can be set to
700
#           -m migrate VMs to another host. Only for images in shared storage
699
#   One of the following modes must be chosen
700
#           -m resched VMs to another host. (Only for images in shared storage!)
701 701
#           -r recreate VMs running in the host. State will be lost.
702 702
#           -d delete VMs running in the host
703
#
703 704
#   Additional flags
704
#           -f force resubmission of suspended VMs
705
#           -p <n> avoid resubmission if host comes
706
#                  back after n monitoring cycles
705
#           -f resubmit suspended and powered off VMs (only for recreate)
706
#           -p <n> avoid resubmission if host comes back after n monitoring
707
#                 cycles. 0 to disable it. Default is 2.
708
#           -u disables fencing. Fencing is enabled by default. Don't disable it
709
#                 unless you are very sure about what you're doing
707 710
#*******************************************************************************
708 711
#
709 712
#HOST_HOOK = [
share/hooks/fence_host.sh
1
#!/bin/bash
2

  
3
# -------------------------------------------------------------------------- #
4
# Copyright 2002-2016, OpenNebula Project, OpenNebula Systems                #
5
#                                                                            #
6
# Licensed under the Apache License, Version 2.0 (the "License"); you may    #
7
# not use this file except in compliance with the License. You may obtain    #
8
# a copy of the License at                                                   #
9
#                                                                            #
10
# http://www.apache.org/licenses/LICENSE-2.0                                 #
11
#                                                                            #
12
# Unless required by applicable law or agreed to in writing, software        #
13
# distributed under the License is distributed on an "AS IS" BASIS,          #
14
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.   #
15
# See the License for the specific language governing permissions and        #
16
# limitations under the License.                                             #
17
#--------------------------------------------------------------------------- #
18

  
19
##############################################################################
20
# WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!
21
#
22
# This script needs to be modified to enable fencing of the host. By default it
23
# will fail, as the first command it runs is 'exit 1'. You will need to remove it.
24
#
25
# In order to perform the fencing, you will probably need to install a fencing
26
# utility. They are typically found in: fence-agents-all (CentOS) and fence-
27
# agents (Ubuntu). They come with many utilities: fence_ilo, fence_ipmilan,
28
# fence_apc, etc...
29
#
30
# To call the fencing utility, you will need to pass some parameters, which are
31
# typically the iLO IP of the host, etc. We recommend you enter this information
32
# in the host's template, and pick it up using the xpath example below. AS AN
33
# EXAMPLE (only an example) the script below expects that you have defined a
34
# parameter called FENCE_IP in the Host's template, and it will rely on that to
35
# call the fencing mechanism. You should customize this to your needs. It is
36
# perfectly OK to discard the code below and use a different mechanism, like
37
# storing the information required to perform the fencing in a separate CMDB,
38
# etc. However, you will probably need to get the host's NAME, which should be
39
# done as shown below.
40
#
41
# WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!
42
#############################################################################
43

  
44
# @param $1 the host information in base64
45
# @return 0 on success. Make sure this script does not return 0 if it fails.
46

  
47
# To enable this script, remove the following line
48
exit 1
49

  
50
#-------------------------------------------------------------------------------
51
# Get host parameters with XPATH
52
#-------------------------------------------------------------------------------
53

  
54
if [ -z "$ONE_LOCATION" ]; then
55
    XPATH=/var/lib/one/remotes/datastore/xpath.rb
56
else
57
    XPATH=$ONE_LOCATION/var/remotes/datastore/xpath.rb
58
fi
59

  
60
if [ ! -x "$XPATH" ]; then
61
    echo "XPATH not found: $XPATH"
62
    exit 1
63
fi
64

  
65
XPATH="${XPATH} -b $1"
66

  
67
unset i j XPATH_ELEMENTS
68

  
69
# Collect the values of the requested XPath expressions into XPATH_ELEMENTS,
# in the order the expressions are listed. 'read -r -d ""' with an empty IFS
# splits the command's output on NUL bytes only, so a value is stored verbatim
# even if it contains spaces or newlines.
while IFS= read -r -d '' element; do
70
    XPATH_ELEMENTS[i++]="$element"
71
done < <($XPATH     /HOST/ID \
72
                    /HOST/NAME \
73
                    /HOST/TEMPLATE/FENCE_IP )
74

  
75
HOST_ID="${XPATH_ELEMENTS[j++]}"
76
NAME="${XPATH_ELEMENTS[j++]}"
77
FENCE_IP="${XPATH_ELEMENTS[j++]}"
78

  
79
if [ -z "$FENCE_IP" ]; then
80
    echo "Fence ip not found"
81
    exit 1
82
fi
83

  
84
#-------------------------------------------------------------------------------
85
# Fence
86
#-------------------------------------------------------------------------------
87

  
88
# Example:
89
# fence_ilo -a $FENCE_IP -l <username> -p <password>
share/hooks/host_error.rb
18 18

  
19 19
##############################################################################
20 20
# Script to implement host failure tolerance
21
#   It can be set to
22
#           -m migrate VMs to another host. Only for images in shared storage
21
#   One of the following modes must be chosen
22
#           -m resched VMs to another host. (Only for images in shared storage!)
23 23
#           -r recreate VMs running in the host. State will be lost.
24 24
#           -d delete VMs running in the host
25
#
25 26
#   Additional flags
26
#           -f force resubmission of suspended VMs
27
#           -p <n> avoid resubmission if host comes
28
#                  back after n monitoring cycles
27
#           -f resubmit suspended and powered off VMs (only for recreate)
28
#           -p <n> avoid resubmission if host comes back after n monitoring
29
#                 cycles. 0 to disable it. Default is 2.
30
#           -u disables fencing. Fencing is enabled by default. Don't disable it
31
#                 unless you are very sure about what you're doing
29 32
##############################################################################
30 33

  
31
##############################################################################
32
# WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!
33
#
34
# This script needs to fence the error host to prevent split brain VMs. You
35
# may use any fence mechanism and invoke it around L105, using host_name
36
#
37
# WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!
38
#############################################################################
39

  
40 34
ONE_LOCATION=ENV["ONE_LOCATION"]
41 35

  
42 36
if !ONE_LOCATION
43 37
    RUBY_LIB_LOCATION="/usr/lib/one/ruby"
44 38
    VMDIR="/var/lib/one"
45 39
    CONFIG_FILE="/var/lib/one/config"
40
    LOG_FILE="/var/log/one/host_error.log"
46 41
else
47 42
    RUBY_LIB_LOCATION=ONE_LOCATION+"/lib/ruby"
48 43
    VMDIR=ONE_LOCATION+"/var"
49 44
    CONFIG_FILE=ONE_LOCATION+"/var/config"
45
    LOG_FILE=ONE_LOCATION+"/var/host_error.log"
50 46
end
51 47

  
48
FENCE_HOST = File.dirname(__FILE__) + '/fence_host.sh'
49

  
52 50
$: << RUBY_LIB_LOCATION
53 51

  
54 52
require 'opennebula'
55 53
include OpenNebula
56 54

  
57 55
require 'getoptlong'
56
require 'base64'
57
require 'open3'
58

  
59
################################################################################
60
# Arguments
61
################################################################################
58 62

  
59
if !(host_id=ARGV[0])
63
HOST_ID = ARGV[0]
64

  
65
if HOST_ID.nil?
60 66
    exit -1
61 67
end
62 68

  
63
mode   = "-r" # By default, recreate VMs
64
force  = "n"  # By default, don't recreate/delete suspended VMs
65
repeat = nil  # By default, don't wait for monitorization cycles"
69
################################################################################
70
# Methods
71
################################################################################
72

  
73
# Appends +msg+ to LOG_FILE, writing one entry per line of +msg+. Every entry
# is prefixed with the current time, the HOST_ID and the severity +level+.
#
# @param msg   [String] text to log; may span several lines
# @param level [String] severity tag written in the prefix ("I" by default)
def log(msg, level="I")
74
    File.open(LOG_FILE, 'a') do |f|
75
        msg.lines do |l|
76
            f.puts "[#{Time.now}][HOST #{HOST_ID}][#{level}] #{l}"
77
        end
78
    end
79
end
80

  
81
# Logs +msg+ through #log with the "E" severity tag.
#
# @param msg [String] error text to log
def log_error(msg)
82
    log(msg, "E")
83
end
84

  
85
# Logs a final "Exiting due to previous error." entry and terminates the hook
# with exit(-1). Takes no arguments: callers are expected to log the actual
# cause with log_error before calling it.
def exit_error
86
    log_error("Exiting due to previous error.")
87
    exit(-1)
88
end
89

  
90
# Builds an XPath predicate that matches any of the given state values, e.g.
# states_xpath(3, 5) yields "STATE=3 or STATE=5".
#
# @param arr [Array] state values to match
def states_xpath(*arr)
91
    arr.map{|e| "STATE=#{e}"}.join(" or ")
92
end
93

  
94
################################################################################
95
# Options
96
################################################################################
97

  
98
mode    = nil    # **must** be set to something other than nil using the options
99
force   = false  # By default, don't recreate/delete suspended and poweroff VMs
100
repeat  = 2      # By default, wait for 2 monitoring cycles
101
fencing = true   # By default, fence the host; the --no-fencing option sets it to false
66 102

  
67 103
opts = GetoptLong.new(
68
            ['--migrate',  '-m',GetoptLong::NO_ARGUMENT],
69
            ['--delete',   '-d',GetoptLong::NO_ARGUMENT],
70
            ['--recreate', '-r',GetoptLong::NO_ARGUMENT],
71
            ['--force',    '-f',GetoptLong::NO_ARGUMENT],
72
            ['--pause',    '-p',GetoptLong::REQUIRED_ARGUMENT]
104
            ['--migrate',     '-m', GetoptLong::NO_ARGUMENT],
105
            ['--delete',      '-d', GetoptLong::NO_ARGUMENT],
106
            ['--recreate',    '-r', GetoptLong::NO_ARGUMENT],
107
            ['--force',       '-f', GetoptLong::NO_ARGUMENT],
108
            ['--pause',       '-p', GetoptLong::REQUIRED_ARGUMENT],
109
            ['--no-fencing',  '-u', GetoptLong::NO_ARGUMENT]
73 110
        )
74 111

  
75 112
begin
76 113
    opts.each do |opt, arg|
77 114
        case opt
78 115
            when '--migrate'
79
                mode="-m"
116
                mode = :migrate
80 117
            when '--delete'
81
                mode="-d"
118
                mode = :delete
82 119
            when '--recreate'
83
                mode="-r"
120
                mode = :recreate
84 121
            when '--force'
85
                force  = "y"
122
                force = true
86 123
            when '--pause'
87 124
                repeat = arg.to_i
125
            when '--no-fencing'
126
                fencing = false
88 127
        end
89 128
    end
90 129
rescue Exception => e
91
    exit(-1)
130
    log_error e.to_s
131
    exit_error
132
end
133

  
134
if mode.nil?
135
    log_error "Exiting. A mode must be supplied."
136
    exit_error
92 137
end
93 138

  
139
################################################################################
140
# Main
141
################################################################################
142

  
143
log "Hook launched"
144

  
94 145
begin
95 146
    client = Client.new()
96 147
rescue Exception => e
97
    puts "Error: #{e}"
98
    exit -1
148
    log_error e.to_s
149
    exit_error
150
end
151

  
152
sys  = OpenNebula::System.new(client)
153
conf = sys.get_configuration
154

  
155
begin
156
    MONITORING_INTERVAL = conf['MONITORING_INTERVAL'] || 60
157
rescue Exception => e
158
    log_error "Could not get MONITORING_INTERVAL"
159
    log_error e.to_s
160
    exit_error
99 161
end
100 162

  
101 163
# Retrieve hostname
102
host  =  OpenNebula::Host.new_with_id(host_id, client)
103
rc = host.info
104
exit -1 if OpenNebula.is_error?(rc)
105
host_name = host.name
106

  
107
if repeat
108
    # Retrieve host monitor interval
109
    monitor_interval = nil
110
    File.readlines(CONFIG_FILE).each{|line|
111
         monitor_interval = line.split("=").last.to_i if /MONITORING_INTERVAL/=~line
112
    }
164
host = OpenNebula::Host.new_with_id(HOST_ID, client)
165
rc   = host.info
166

  
167
if OpenNebula.is_error?(rc)
168
    log_error "Could not get host info"
169
    exit_error
170
end
171

  
172
log "hostname: #{host.name}"
173

  
174
if repeat > 0
175
    log "Wait #{repeat} cycles."
176

  
113 177
    # Sleep through the desired number of monitoring intervals
114
    sleep (repeat * monitor_interval)
178
    period = repeat * MONITORING_INTERVAL.to_i
179

  
180
    log "Sleeping #{period} seconds."
181
    sleep(period)
182

  
183
    rc = host.info
184
    if OpenNebula.is_error?(rc)
185
        log_error "Could not get host info"
186
        exit_error
187
    end
115 188

  
116 189
    # If the host came back, exit to avoid duplicated VMs
117
    exit 0 if host.state != 3
190
    if host.state != 3
191
        log "Exiting. Host came back after waiting."
192
        exit 0
193
    end
194
end
195

  
196
# Do fencing
197
if fencing
198
    host64 = Base64::strict_encode64(host.to_xml)
199

  
200
    log "Fencing enabled"
201

  
202
    begin
203
        # Run the fence script and collect its combined stdout/stderr.
        # capture2e drains the output while the script runs and closes every
        # pipe afterwards, so a chatty script cannot block on a full pipe and
        # no file descriptors are left open.
        output, status = Open3.capture2e(FENCE_HOST, host64)
204
        if status.success?
205
            log output
206
            log "Fencing success"
207
        else
208
            raise "#{output}\nFencing error"
209
        end
210
    rescue Exception => e
211
        log_error e.to_s
212
        exit_error
213
    end
214
else
215
    log "WARNING: Fencing disabled"
118 216
end
119 217

  
120 218
# Loop through all vms
121 219
vms = VirtualMachinePool.new(client)
122
rc = vms.info_all
123
exit -1 if OpenNebula.is_error?(rc)
220
rc  = vms.info_all
221

  
222
if OpenNebula.is_error?(rc)
223
    # exit_error takes no arguments, so the cause is logged separately first.
    log_error "Could not get vm pool"
    exit_error
224
end
124 225

  
226
# STATE=3: ACTIVE (LCM unknown)
227
# STATE=5: SUSPENDED
228
# STATE=8: POWEROFF
125 229

  
126
state = "STATE=3"
127
state += " or STATE=5 or STATE=8" if force == "y"
230
if mode == :recreate && !force
231
    log "states: 3"
232
    state = states_xpath(3)
233
else
234
    log "states: 3, 5, 8"
235
    state = states_xpath(3, 5, 8)
236
end
128 237

  
129
vm_ids_array = vms.retrieve_elements("/VM_POOL/VM[#{state}]/HISTORY_RECORDS/HISTORY[HOSTNAME=\"#{host_name}\" and last()]/../../ID")
238
xpath = "/VM_POOL/VM[#{state}]/HISTORY_RECORDS/HISTORY[HOSTNAME=\"#{host.name}\" and last()]"
239
vm_ids_array = vms.retrieve_elements("#{xpath}/../../ID")
130 240

  
131 241
if vm_ids_array
242
    log "vms: #{vm_ids_array}"
243

  
132 244
    vm_ids_array.each do |vm_id|
133
        vm=OpenNebula::VirtualMachine.new_with_id(vm_id, client)
134
        vm.info
245
        vm = OpenNebula::VirtualMachine.new_with_id(vm_id, client)
246
        rc = vm.info
135 247

  
136
        if mode == "-r"
248
        if OpenNebula.is_error?(rc)
249
            log_error "Could not get info of VM #{vm_id}"
250
            next
251
        end
252

  
253
        case mode
254
        when :recreate
255
            log "recreate #{vm_id}"
137 256
            vm.delete(true)
138
        elsif mode == "-d"
257
        when :delete
258
            log "delete #{vm_id}"
139 259
            vm.delete
140
        elsif mode == "-m"
260
        when :migrate
261
            log "resched #{vm_id}"
141 262
            vm.resched
263
        else
264
            log_error "unknown mode '#{mode}'"
265
            exit_error
142 266
        end
143 267
    end
268
else
269
    log "No VMs found."
144 270
end
145 271

  
272
log "Hook finished"
273
exit 0

Also available in: Unified diff