#!/usr/bin/env python
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
# 
#   http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import os, sys, re, tempfile, shutil, pickle, getpass, subprocess, time, glob, ConfigParser
from xml.dom import minidom

try:
    from optparse import Option, OptionParser 
    from gppylib.gpparseopts import OptParser, OptChecker
    from gppylib.gplog import get_default_logger, setup_tool_logging
    from gppylib.commands.unix import getLocalHostname, getUserName, SYSTEM
    from gppylib.commands.base import WorkerPool, Command, REMOTE
    from gppylib.gpcheckutil import HostType, hosttype_str
    from hawqpylib.hawqlib import remote_ssh_output
    from pygresql.pgdb import DatabaseError
    from pygresql import pg
    import stat

except ImportError, e:    
    sys.exit('Cannot import modules.  Please check that you have sourced greenplum_path.sh.  Detail: ' + str(e))


class GpCheckError(Exception):
    """Error raised by gpcheck when a step cannot be carried out.

    Used throughout this file for bad command line arguments, unreadable
    configuration files and failed data collection.
    """


class GpCheckInfo:
    """Run-wide state shared by the collection and the test phases."""

    def __init__(self):
        # facts about the machine gpcheck itself is running on
        self.host_type = HostType.GPCHECK_HOSTTYPE_UNDEFINED
        self.is_root = os.geteuid() == 0
        self.appliance_version = None

        # gpcheck_hostdump data for each host: hostname => GpCheckHost obj
        self.hosts = {}

        # HAWQ configuration collected from the running cluster
        self.hawq_gucs = {}  # guc name => collected value
        self.hawq_segment_configuration = None
        self.hawq_collected_ok = False  # True once HAWQ gucs were collected

        # time window of the remote collection, used in NTPD testing
        self.collection_start_time = 0
        self.collection_end_time = 0


class GpCheckHost:
    """A single host under test together with its collected data."""

    def __init__(self, name, is_namenode=False):
        self.hostname = name
        self.is_namenode = is_namenode
        self.datafile = None  # pickle file on each host
        self.data = None  # `gpcheck_hostdump` collected data for each host

    def __str__(self):
        # "<hostname> datafile(<path>)", followed by " namenode" when flagged
        pieces = ["%s datafile(%s)" % (self.hostname, self.datafile)]
        if self.is_namenode:
            pieces.append("namenode")
        return " ".join(pieces)


class GpCheckConfig:
    """Expected settings loaded from the gpcheck configuration file.

    Every ``*_expected`` attribute maps a setting name to the value gpcheck
    expects to find.  Values are kept as the strings read from the file,
    except for the limits, the disk usage maximum and the two HDFS heap
    sizes, which are converted to integers.
    """

    def __init__(self):
        self.parser = ConfigParser.RawConfigParser()
        self.gpcheck_config_version = 0

        self.mount_points = set()
        self.sysctl_expected = dict()

        self.limits_expected = { # default value for limits.conf
                ("soft", "nofile"): 2900000,
                ("hard", "nofile"): 2900000,
                ("soft", "nproc") : 131072,
                ("hard", "nproc") : 131072 }

        self.diskusage_mounts = []
        self.diskusage_usagemax = 90 # max disk usage percentage

        self.hdfs_expected = { # default value for HDFS configuration
                "dfs.mem.namenode.heap": 8192,
                "dfs.mem.datanode.heap": 8192 }
        self.hdfs_non_expected = {}
        self.hdfs_ha_expected = {}
        self.hdfs_kerberos_expected = {}
        self.hdfs_ha_kerberos_expected = {}

        self.yarn_expected = {}
        self.yarn_non_expected = {}
        self.yarn_ha_expected = {}
        self.yarn_kerberos_expected = {}
        self.yarn_ha_kerberos_expected = {}

        self.hawq_expected = {}
        self.hawq_kerberos_expected = {}
        self.hawq_yarn_expected = {}

    def _loadSection(self, section, target):
        """Copy every option of `section` into the dict `target` (as strings)."""
        for opt in self.parser.options(section):
            target[opt] = self.parser.get(section, opt)

    def _convertHeapSizes(self):
        """Turn the two HDFS heap size settings into integers."""
        for key in ("dfs.mem.namenode.heap", "dfs.mem.datanode.heap"):
            try:
                self.hdfs_expected[key] = int(self.hdfs_expected[key])
            except ValueError as e:
                # name the setting that is actually wrong
                raise GpCheckError("'%s' should be a number: %s" % (key, e))

    def readConfigFile(self, config_file):
        """Parse `config_file` and fill in the expected-value attributes.

        Two layouts are accepted: a single 'hdfs' section, or the split
        layout that starts with 'hdfs.base' and then also requires the
        'hdfs.*', 'yarn.*' and 'hawq.*' sections read below.

        Raises GpCheckError when the file cannot be read, a required section
        is missing or a numeric value is malformed.  A missing section that
        is read unconditionally surfaces as the parser's own exception.
        """
        parsed_list = self.parser.read(config_file)
        if len(parsed_list) != 1:
            raise GpCheckError("cannot open file!")

        if not self.parser.has_section("linux.sysctl"):
            raise GpCheckError("require section 'linux.sysctl'")

        section = "global"
        if self.parser.has_option(section, "configfile_version"):
            self.gpcheck_config_version = self.parser.getint(section, "configfile_version")

        section = "linux.mount"
        if self.parser.has_option(section, "mount.points"):
            for p in self.parser.get(section, "mount.points").split(","):
                self.mount_points.add(p.strip())

        section = 'linux.sysctl'
        for opt in self.parser.options(section):
            if re.match(r'sysctl\.', opt):
                fields = opt.split('sysctl.')
                if len(fields) != 2:
                    raise GpCheckError("Bad config line entry '%s'" % opt)
                self.sysctl_expected[fields[1]] = self.parser.get(section, opt)

        # option names look like "soft.nofile" and become ("soft", "nofile")
        section = "linux.limits"
        for opt in self.parser.options(section):
            key = tuple(opt.split("."))
            self.limits_expected[key] = self.parser.getint(section, opt)

        section = "linux.diskusage"
        if self.parser.has_option(section, "diskusage.monitor.mounts"):
            self.diskusage_mounts = [m.strip() for m in self.parser.get(section, "diskusage.monitor.mounts").split(",")]
        if self.parser.has_option(section, "diskusage.monitor.usagemax"):
            raw_usagemax = self.parser.get(section, "diskusage.monitor.usagemax")
            try:
                # accept both "90" and "90%"
                if raw_usagemax[-1] == "%":
                    self.diskusage_usagemax = int(raw_usagemax[:-1])
                else:
                    self.diskusage_usagemax = int(raw_usagemax)
            except Exception as e:
                raise GpCheckError("Bad config entry value '%s' for 'diskusage.monitor.usagemax': %s" %
                        (raw_usagemax, e))

        if not self.parser.has_section('hdfs.base'):
            # single-section layout
            if not self.parser.has_section("hdfs"):
                raise GpCheckError("require section 'hdfs'")

            self._loadSection('hdfs', self.hdfs_expected)
            self._convertHeapSizes()
            return

        # split layout: base settings plus one section per deployment flavour
        self._loadSection('hdfs.base', self.hdfs_expected)
        self._convertHeapSizes()

        flavour_sections = (
            ('hdfs.non', self.hdfs_non_expected),
            ('hdfs.ha', self.hdfs_ha_expected),
            ('hdfs.kerberos', self.hdfs_kerberos_expected),
            ('hdfs.ha.kerberos', self.hdfs_ha_kerberos_expected),
            ('yarn.base', self.yarn_expected),
            ('yarn.non', self.yarn_non_expected),
            ('yarn.ha', self.yarn_ha_expected),
            ('yarn.kerberos', self.yarn_kerberos_expected),
            ('yarn.ha.kerberos', self.yarn_ha_kerberos_expected),
            ('hawq.base', self.hawq_expected),
            ('hawq.kerberos', self.hawq_kerberos_expected),
            ('hawq.yarn', self.hawq_yarn_expected),
        )
        for section, target in flavour_sections:
            self._loadSection(section, target)

###### Global Variables #############
# Module-wide logger; tool logging is set up with this script's file name,
# the local hostname and the current user name.
logger = get_default_logger()
EXECNAME = os.path.split(__file__)[-1]
setup_tool_logging(EXECNAME,getLocalHostname(),getUserName())

# The following four are assigned by parseargs().
options = None
GPHOME = None
GPCHECK_CONFIG_FILE = None
HADOOP_HOME = None

# Shared state: collected host data, expected settings and the worker pool
# that the functions below use to run commands.
gpcheck_info = GpCheckInfo()
gpcheck_config = GpCheckConfig()
pool = WorkerPool()
# Directory the hostdump files are copied into (see copyFilesLocally).
tmpdir = None
# Number of failures reported so far; incremented by checkFailed().
found_errors = 0

# Name of the HAWQ memory GUC read in readHAWQConfiguration().
HAWQ_GUC_MEMORY    = "hawq_re_memory_overcommit_max"


def checkPlatform():
    """Detect the local platform and store it in gpcheck_info.host_type.

    Raises GpCheckError when the platform name is not one gpcheck knows.
    """
    supported_platforms = dict(
        linux=HostType.GPCHECK_HOSTTYPE_GENERIC_LINUX,
        sunos=HostType.GPCHECK_HOSTTYPE_GENERIC_SOLARIS)
    try:
        detected = supported_platforms[SYSTEM.getName()]
        gpcheck_info.host_type = detected
        logger.info("Detected platform: %s" % hosttype_str(gpcheck_info.host_type))
    except KeyError:
        raise GpCheckError("No tests exists for this platform in gpcheck")


def parse_host_list_file(host_file):
    """Return the host names listed in `host_file`, in file order.

    Each line holds one host name.  Everything from the first '#' on is
    treated as a comment, and lines that end up empty are skipped.
    """
    with open(host_file) as fp:
        raw_lines = fp.readlines()

    # drop the comment part and surrounding whitespace of every line
    candidates = (entry.split("#", 1)[0].strip() for entry in raw_lines)
    return [name for name in candidates if name]

def parseargs():
    """Parse the command line and set the option related globals.

    Sets `options`, `GPHOME`, `GPCHECK_CONFIG_FILE` and `HADOOP_HOME`.
    A missing HADOOP_HOME is reported through checkFailed() and disables
    the YARN check.  Raises GpCheckError when GPHOME cannot be determined
    or when the combination of options is invalid.
    """
    global options, GPHOME, HADOOP_HOME, GPCHECK_CONFIG_FILE

    parser = OptParser(option_class=OptChecker, version='%prog version $Revision: #1 $')
    # '-h' is re-used for --host below, so help moves to '-?'
    parser.remove_option('-h')
    parser.add_option('-?', '--help', action='help')
    for flag in ('--verbose', '--stdout', '--zipout'):
        parser.add_option(flag, action='store_true')
    for value_opt in ('--zipin', '--gphome'):
        parser.add_option(value_opt, type='string')
    # for HDFS xml and memory check
    parser.add_option('--hadoop', '--hadoop-home', type='string')
    parser.add_option('--hdfs', action='store_true')
    parser.add_option('--hdfs-ha', dest="hdfs_ha", action='store_true')
    parser.add_option('--yarn', action='store_true')
    parser.add_option('--yarn-ha', dest="yarn_ha", action='store_true')
    parser.add_option('--kerberos', action='store_true')

    parser.add_option('-c', '--config', type='string') # optional: gpcheck config file path
    parser.add_option('-f', '--file',  type='string')  # host file, for testing a list of hosts
    parser.add_option('-h', '--host',  type='string')  # test a single host

    (options, args) = parser.parse_args()
    if args and args[0] == 'help':
        parser.print_help(sys.stderr)
        sys.exit(0)

    # GPHOME must be found
    GPHOME = options.gphome or os.environ.get("GPHOME")
    if not GPHOME:
        raise GpCheckError("GPHOME not set, must be specified in --gphome")
    GPCHECK_CONFIG_FILE = options.config or "%s/etc/gpcheck.cnf" % GPHOME
    logger.info("Checks uses config file: %s", GPCHECK_CONFIG_FILE)

    HADOOP_HOME = options.hadoop or os.environ.get("HADOOP_HOME")
    if not HADOOP_HOME:
        checkFailed(None, "utility will SKIP HDFS configuration check because HADOOP_HOME is not specified in environment variable or --hadoop")
        if options.yarn:
            options.yarn = False
            checkFailed(None, "utility will SKIP YARN configuration check because HADOOP_HOME is not specified in environment variable or --hadoop")

    # params check
    if not (options.file or options.host or options.zipin):
        raise GpCheckError(" --file or --host or --zipin must be specified")

    if options.file and options.host:
        raise GpCheckError(" You can specify either --file or --host, but not both")

    if options.stdout and options.zipout:
        raise GpCheckError(" You can specify either --stdout or --zipout, but not both")


def readConfigFile():
    """Load GPCHECK_CONFIG_FILE into the global gpcheck_config.

    Any failure while reading or parsing is re-raised as GpCheckError with
    the file name prepended.
    """
    try:
        gpcheck_config.readConfigFile(GPCHECK_CONFIG_FILE)

    except Exception as e:
        raise GpCheckError("Failed to read gpcheck config file '%s':\n%s" % (GPCHECK_CONFIG_FILE, e))


def checkFailed(host, msg):
    """Record one failed check and log it as an error.

    `host` is the hostname the failure belongs to; pass a false value for
    failures that are not tied to a host.
    """
    global found_errors
    found_errors += 1
    if not host:
        logger.error(msg)
        return
    logger.error("host(%s): %s", host, msg)


def getHDFSNamenodeHost():
    """Return the hostname of the HDFS namenode as reported by the node itself.

    The address is taken from fs.default.name / fs.defaultFS in
    core-site.xml.  When that URL carries no port it is treated as an HA
    nameservice and the first namenode listed under dfs.ha.namenodes.* in
    hdfs-site.xml is resolved through its dfs.namenode.rpc-address entry.
    Finally `hostname` is run on that address to obtain the actual name.

    Raises GpCheckError when no namenode address can be determined and
    Exception when the remote `hostname` command fails.
    """
    core_site_file = os.path.join(HADOOP_HOME, "etc/hadoop/core-site.xml")
    hdfs_site_file = os.path.join(HADOOP_HOME, "etc/hadoop/hdfs-site.xml")
    logger.info("try to detect namenode from %s" % core_site_file)

    # for processing property xml; an absent or empty element yields ''
    def getChildText(node, tag):
        elements = node.getElementsByTagName(tag)
        if not elements or not elements[0].childNodes:
            return ''
        return elements[0].childNodes[0].data

    getPropName = lambda node: getChildText(node, 'name')
    getPropValue = lambda node: getChildText(node, 'value')

    # read namenode address from core-site.xml
    with open(core_site_file) as f:
        xmldoc = minidom.parse(f)
    namenode_addr = ''
    namenode_list_alias = ''  # stays '' when no default FS is configured
    for node in xmldoc.getElementsByTagName('property'):
        if getPropName(node) in ('fs.default.name', 'fs.defaultFS'):
            fsurl = getPropValue(node).strip()
            authority = re.search(r"//([^:/]*)", fsurl)
            if authority:
                namenode_list_alias = authority.group(1)
            # a trailing ":port" means a plain (non-HA) namenode address
            if re.search(".*:[0-9]+$", fsurl):
                namenode_addr = namenode_list_alias
            break

    # HA setup (or no usable address): look the namenode up in hdfs-site.xml
    if namenode_addr == '':
        with open(hdfs_site_file) as f:
            hdfs_properties = minidom.parse(f).getElementsByTagName('property')

        ha_namenode_list = ''
        default_namenode_alias = ''
        for node in hdfs_properties:
            if re.search('dfs.ha.namenodes.*', getPropName(node).strip()):
                ha_namenode_list = getPropValue(node).strip()
                default_namenode_alias = ha_namenode_list.split(',')[0].strip()
                break

        if ha_namenode_list == '':
            logger.error("cannot detect namenode from %s" % core_site_file)
            raise GpCheckError("cannot detect namenode from %s" % core_site_file)

        namenode_rpc_address = "dfs.namenode.rpc-address.%s.%s" % (namenode_list_alias,
                                                                   default_namenode_alias)
        for node in hdfs_properties:
            if getPropName(node) == namenode_rpc_address:
                default_namenode_rpc_address = getPropValue(node).strip()
                namenode_addr = default_namenode_rpc_address.split(':')[0].strip()

    if namenode_addr == '':
        raise GpCheckError("cannot detect namenode from %s" % core_site_file)

    # run hostname command on remote to get actual hostname
    cmd = Command(namenode_addr, "hostname", REMOTE, namenode_addr)
    pool.addCommand(cmd)
    pool.join()
    namenode_host = None
    for i in pool.getCompletedItems():
        if i.results.rc or i.results.halt or not i.results.completed:
            raise Exception("error running 'hostname' command: %s" % i.results.stderr.strip())
        namenode_host = i.results.stdout.strip()

    if namenode_host is None:
        # the pool handed back no result at all
        raise GpCheckError("cannot get hostname of namenode '%s'" % namenode_addr)

    logger.info("detect namenode hostname to be %s" % namenode_host)
    return namenode_host


def createHostList():
    """Fill gpcheck_info.hosts with the hosts to check.

    Hosts come from --file (one per line, '#' starts a comment) or from
    --host.  `hostname` is run on each of them and the reported name is
    used as the key, so several names for the same host collapse into one
    entry.  When HADOOP_HOME is set, the HDFS namenode is looked up and
    flagged if it is one of these hosts.

    Raises GpCheckError when the host file cannot be read or `hostname`
    fails on a host.  Namenode detection problems are only reported
    through checkFailed().
    """
    if options.verbose:
        logger.info("trying to deduplicate hosts...")

    hostlist = []
    # read the host file if present
    if options.file:
        try:
            hostlist = parse_host_list_file(options.file)

        except IOError as e:
            raise GpCheckError("error reading host file '%s': %s" % (options.file, str(e)))
    else:
        hostlist.append(options.host)

    # get actual hostname and deduplicate
    try:
        for hostname in hostlist:
            cmd = Command(hostname, "hostname", REMOTE, hostname)
            pool.addCommand(cmd)

        pool.join()
        items = pool.getCompletedItems()
        for i in items:
            if i.results.rc or i.results.halt or not i.results.completed:
                raise Exception("error running 'hostname' on host '%s': %s" % (i.remoteHost, i.results.stderr.strip()))

            actualHostname = i.results.stdout.strip()
            if actualHostname not in gpcheck_info.hosts:
                gpcheck_info.hosts[actualHostname] = GpCheckHost(actualHostname)

    except Exception as e:
        raise GpCheckError("failed to collect 'hostname' on servers: %s" % str(e))

    if options.verbose:
        logger.info("trying to deduplicate hosts [success]")

    if HADOOP_HOME:
        try:
            namenode_host = getHDFSNamenodeHost()
            # namenode_host is a name reported by `hostname`, so it has to be
            # compared with the resolved names, not with what the user typed
            if namenode_host in gpcheck_info.hosts:
                gpcheck_info.hosts[namenode_host].is_namenode = True
            else:
                logger.warning("utility will skip HDFS namenode check since it's not in current host list.")

        except Exception as e:
            checkFailed(None, "utility will SKIP HDFS namenode check: %s" % str(e))


def runCollections():
    """Collect the hostdump data of every host onto this machine."""
    logger.info("trying to collect server configuration...")

    collection_steps = (
        runCollectionOnServers,  # run gpcheck_hostdump on each server
        copyFilesLocally,        # copy hostdump file to master
        deleteRemoteFiles,       # delete hostdump file on remote servers
    )
    for step in collection_steps:
        step()

    logger.info("trying to collect server configuration [success]")


def runCollectionOnServers():
    """Run gpcheck_hostdump on every host and remember the dump file path.

    The path printed by the remote command is stored in the `datafile`
    attribute of the matching GpCheckHost.  Start and end time of the
    collection are recorded in gpcheck_info.  Any failure, including an
    unsupported host type, is raised as GpCheckError.
    """
    gpcheck_info.collection_start_time = time.time()

    try:
        # assemble the remote command line once; it is the same for all hosts
        if gpcheck_info.host_type == HostType.GPCHECK_HOSTTYPE_GENERIC_LINUX:
            platform_flag = "--linux"
        elif gpcheck_info.host_type == HostType.GPCHECK_HOSTTYPE_GENERIC_SOLARIS:
            platform_flag = "--solaris"
        else:
            raise GpCheckError("unsupported host type")

        pieces = [
            "%s/sbin/gpcheck_hostdump --hawq %s" % (GPHOME, platform_flag),
            "--sysctl %s" % ",".join(gpcheck_config.sysctl_expected.keys()),
        ]
        if HADOOP_HOME:
            pieces.append("--hadoop %s" % HADOOP_HOME)
        if options.yarn or options.yarn_ha:
            pieces.append("--yarn")
        dump_command = " ".join(pieces)

        for target in gpcheck_info.hosts:
            if options.verbose:
                logger.info("collect data on host: %s" % target)
            pool.addCommand(Command(target, dump_command, REMOTE, target))

        pool.join()
        for done in pool.getCompletedItems():
            outcome = done.results
            if outcome.rc or outcome.halt or not outcome.completed:
                raise Exception("error running gpcheck_hostdump on '%s': %s" % (done.remoteHost, outcome.stderr.strip()))

            gpcheck_info.hosts[done.remoteHost].datafile = outcome.stdout.strip()

    except Exception as e:
        raise GpCheckError("Failed to collect data from servers:\n%s" % e)

    gpcheck_info.collection_end_time = time.time()


def copyFilesLocally():
    """scp each host's dump file to `<tmpdir>/<host>.data` on this machine.

    Raises GpCheckError when any of the copies fails.
    """
    if options.verbose:
        logger.info("copy hostdump files from remote servers to master")

    try:
        for hostname, entry in gpcheck_info.hosts.items():
            scp_command = "scp %s:%s %s/%s.data" % (hostname, entry.datafile, tmpdir, hostname)
            if options.verbose:
                logger.info(scp_command)
            # no remote context: scp itself is executed locally
            pool.addCommand(Command(hostname, scp_command))

        pool.join()
        for done in pool.getCompletedItems():
            outcome = done.results
            if outcome.rc or outcome.halt or not outcome.completed:
                raise Exception("error running command %s: %s" % (done.cmdStr, outcome.stderr.strip()))

    except Exception as e:
        raise GpCheckError("Failed to scp remote hostdump file to master:\n%s" % e)


def deleteRemoteFiles():
    """Remove the dump file that gpcheck_hostdump left on every host.

    Raises GpCheckError when any of the remote removals fails.
    """
    if options.verbose:
        logger.info("delete hostdump files on remote servers")

    try:
        for hostname, entry in gpcheck_info.hosts.items():
            rm_command = "rm -f %s" % entry.datafile
            if options.verbose:
                logger.info(rm_command)
            pool.addCommand(Command(hostname, rm_command, REMOTE, hostname))

        pool.join()
        for done in pool.getCompletedItems():
            outcome = done.results
            if outcome.rc or outcome.halt or not outcome.completed:
                raise Exception("error running command %s: %s" % (done.cmdStr, outcome.stderr.strip()))

    except Exception as e:
        raise GpCheckError("Failed to delete remote hostdump file:\n%s" % e)


def readDataFiles():
    """Unpickle `<tmpdir>/<host>.data` into the `data` attribute of each host.

    Raises GpCheckError when a file cannot be opened or unpickled.
    """
    for hostname, entry in gpcheck_info.hosts.items():
        dump_path = "%s/%s.data" % (tmpdir, hostname)
        try:
            # NOTE(review): pickle.load runs on files fetched from the checked
            # hosts; this is only safe as long as those hosts are trusted.
            with open(dump_path, "rb") as dump_file:
                entry.data = pickle.load(dump_file)
        except Exception as e:
            raise GpCheckError("Failed to load pickle file '%s': %s" % (dump_path, e))


def readHAWQConfiguration():
    """Collect the HAWQ segment configuration and the memory GUC.

    Connects to the database named by PGDATABASE (default 'template1'),
    stores the rows of gp_segment_configuration in gpcheck_info and reads
    HAWQ_GUC_MEMORY through `hawqconfig -s`.  On success
    gpcheck_info.hawq_collected_ok is set; a connection failure or an
    unparsable hawqconfig output is reported through checkFailed() and
    leaves the flag False.
    """
    if options.verbose:
        logger.info("trying to collect HAWQ configuration...")

    dbname = os.environ.get('PGDATABASE', 'template1')
    try:
        db = pg.connect(dbname=dbname)
    except pg.InternalError:
        checkFailed(None, "utility cannot perform HAWQ CPU and Memory check because failed to connect to HAWQ")
        return

    # read segment configurations; always release the connection, even when
    # the query itself fails
    try:
        gpcheck_info.hawq_segment_configuration = db.query("select * from gp_segment_configuration").dictresult()
    finally:
        db.close()

    # read Memory GUC using hawqconfig
    command = "hawqconfig -s %s" % HAWQ_GUC_MEMORY
    p = subprocess.Popen(command, shell = True,
            stdout = subprocess.PIPE, stderr = subprocess.PIPE)
    result = p.communicate()
    # the label and the colon are separated by tabs/spaces in the output
    match_master = re.search(r'Value[ \t]+: (\d+)', result[0])

    if match_master:
        gpcheck_info.hawq_gucs[HAWQ_GUC_MEMORY] = int(match_master.group(1))
    else:
        checkFailed(None, "utility cannot perform HAWQ Memory check because failed to get GUC value using '%s'" % command)
        return

    gpcheck_info.hawq_collected_ok = True
    if options.verbose:
        logger.info("trying to collect HAWQ configuration [success]")


def testConnectEmc(host):
    """Report a failure when Connect EMC is not running on a master host.

    Hosts that are not flagged with `is_a_master` are skipped.  GpCheckHost
    as defined in this file does not set that attribute, so a host without
    it is treated as a non-master instead of raising AttributeError.
    """
    if not getattr(host, 'is_a_master', False):
        return

    expected = "Running"

    if host.data.connectemc.output != expected:
        checkFailed(host.hostname, "Connect EMC is not running on master (try /etc/init.d/connectemc status)")
    


def testSolarisEtcSystem(host):
    """Check that /etc/system sets each required parameter to its value.

    One failure is reported per required parameter that is missing or has
    a different value.
    """
    requiredValues = {  'rlim_fd_cur' : '65536',
                        'zfs:zfs_arc_max' : '0x600000000',
                        'pcplusmp:apic_panic_on_nmi' : '1',
                        'nopanicdebug' : '1' }

    # parameter => 1 once it was seen with the required value
    results = dict.fromkeys(requiredValues.keys(), 0)

    for key in host.data.etc_system.parameters.keys():
        if key in requiredValues and host.data.etc_system.parameters[key] == requiredValues[key]:
            results[key] = 1

    for k in results.keys():
        if not results[k]:
            checkFailed(host.hostname, "/etc/system is missing expected line 'set %s=%s'" % (k, requiredValues[k]))


def testSolarisEtcProject(host):
    """Check /etc/project for the required line and for unexpected lines.

    Every unexpected line is reported as it is met; the required line is
    reported afterwards when it never appeared.
    """
    required_line = 'default:3::::project.max-sem-ids=(priv,1024,deny);process.max-file-descriptor=(priv,252144,deny)'
    unexpected_lines = set(['default:3::::'])

    required_seen = False
    for line in host.data.etc_project.lines:
        if line in unexpected_lines:
            checkFailed(host.hostname, "unexpected line in /etc/project: '%s'" % line)
        elif line == required_line:
            required_seen = True

    if not required_seen:
        checkFailed(host.hostname, "/etc/project is missing expected line '%s'" % required_line)
        

def testSolarisEtcUserAttr(host):
    """Report a failure when /etc/user_attr lacks the required gpadmin line."""
    required_line = 'gpadmin::::defaultpriv=basic,dtrace_user,dtrace_proc'

    required_seen = False
    for line in host.data.etc_user_attr.lines:
        if line == required_line:
            required_seen = True

    if not required_seen:
        checkFailed(host.hostname, "/etc/user_attr is missing expected line '%s'" % required_line)
 

def testHAWQGUC(host):
    """Check host memory and the HAWQ memory GUC for one host.

    Does nothing when the HAWQ configuration was not collected, and only
    warns when the host is not part of the HAWQ array.  The master host is
    compared against the GUC value, every other host against the expected
    datanode heap size; in both cases the GUC is expected to be 8192.
    """
    if not gpcheck_info.hawq_collected_ok:
        return

    if options.verbose:
        logger.info("-- test HAWQ CPU/Memory Guc Settings")

    seg_config = gpcheck_info.hawq_segment_configuration
    master_hostname = [seg for seg in seg_config if seg['role'] == 'm'][0]['hostname']

    if host.hostname not in [seg['hostname'] for seg in seg_config]:
        logger.warning("host '%s' is not in HAWQ array" % host.hostname)
        return

    host_memory = host.data.machine.memory_in_MB
    guc_value = gpcheck_info.hawq_gucs[HAWQ_GUC_MEMORY]
    # segment count on this host
    segment_count = len([seg for seg in seg_config if seg['hostname'] == host.hostname])

    if host.hostname != master_hostname:
        expected_datanode_mem = gpcheck_config.hdfs_expected["dfs.mem.datanode.heap"]

        # check HAWQ memory size
        if host_memory < expected_datanode_mem:
            checkFailed(host.hostname, "HAWQ segment's host memory size '%s' is less than the expected data node memory size '%s'" % (
                host_memory, expected_datanode_mem))
            logger.warning("please change the expected data node memory 'dfs.mem.datanode.heap' in gpcheck.cnf file")
            logger.warning("SKIP '%s' check" %(HAWQ_GUC_MEMORY))
            return

        expected_segment_guc = 8192
        if guc_value != expected_segment_guc:
            checkFailed(host.hostname, "HAWQ segment's %s GUC value on this host is %s, expected %s" % (
                HAWQ_GUC_MEMORY, guc_value, expected_segment_guc))
        return

    # master host from here on
    if segment_count > 1:
        checkFailed(host.hostname, "HAWQ master host has segments configured")

    if host_memory < guc_value:
        checkFailed(host.hostname, "HAWQ master host memory size '%s' is less than the '%s' size '%s'" % (
            host_memory, HAWQ_GUC_MEMORY, guc_value))
        return

    # check HAWQ master's memory size
    expected_master_guc = 8192
    if guc_value != expected_master_guc:
        checkFailed(host.hostname, "HAWQ master's %s GUC value is %s, expected %s" % (
            HAWQ_GUC_MEMORY, guc_value, expected_master_guc))
        

def testDiskCapacity(host):
    """Report every monitored mount whose usage exceeds the configured maximum.

    All mounts are checked when no mounts are configured for monitoring.
    """
    # NOTE(review): an identical testDiskCapacity is defined again further
    # down in this file and replaces this one at import time.
    if options.verbose:
        logger.info("-- test Disk Capacity")

    for entry in host.data.diskusage.lines:
        monitored = gpcheck_config.diskusage_mounts
        if len(monitored) != 0 and entry.mount not in monitored:
            continue

        # used_percent carries a trailing unit character, e.g. "42%"
        used = int(entry.used_percent[:-1])
        if used > gpcheck_config.diskusage_usagemax:
            checkFailed(host.hostname,
                       "potential disk full risk: %s mounted on %s has used %s space" % (
                           entry.fs, entry.mount, entry.used_percent))


def testHAWQconfig(host):
    """Compare the HAWQ client configuration of one host with the expectations.

    Skipped when no HAWQ or HDFS data was collected for the host.  The
    expected settings are the 'hawq' entries of the gpcheck configuration,
    extended by the kerberos and YARN entries when those options are given.
    In addition the authentication mode, the YARN resource manager
    properties, short-circuit read settings, the domain socket path and
    output.replace-datanode-on-failure are verified.  Every mismatch is
    reported through checkFailed().
    """
    hawq = host.data.hawq
    hdfs = host.data.hdfs
    if hawq is None or hdfs is None:
        return # skip HAWQ test when hawq or hdfs data is missing

    if options.verbose:
        logger.info("-- test HAWQ config")

    if hawq.errormsg:
        checkFailed(host.hostname, "collect HAWQ configuration error: %s" % hawq.errormsg)
        return

    datanode_list = list()
    if HADOOP_HOME:
        datanode_list = parse_host_list_file("%s/etc/hadoop/slaves" % HADOOP_HOME)
    is_datanode = host.hostname in datanode_list

    # work on a copy so the shared gpcheck_config is not modified
    expect_config = dict(gpcheck_config.hawq_expected)

    if options.kerberos:
        expect_config.update(gpcheck_config.hawq_kerberos_expected)

    if options.yarn or options.yarn_ha:
        expect_config.update(gpcheck_config.hawq_yarn_expected)

    actual_config = hawq.site_config
    hdfs_actual_config = hdfs.site_config

    for exp_key, exp_val in expect_config.items():
        if exp_key not in actual_config:
            checkFailed(host.hostname, "HAWQ configuration missing: '%s' needs to be set to '%s'" % (exp_key, exp_val))

        else:
            actual_val = actual_config[exp_key]
            et = (exp_key, exp_val, actual_val)

            if exp_key == "dfs.block.local-path-access.user":
                if exp_val not in actual_val.split(','):
                    checkFailed(host.hostname, "HDFS configuration: '%s' should include user '%s', actual value is '%s'" % et)
            elif exp_key == "dfs.namenode.handler.count":
                if int(exp_val) > int(actual_val):
                    checkFailed(host.hostname, "HDFS configuration: '%s' should be at least '%s', actual value is '%s'" % et)
            else:
                if exp_val != actual_val:
                    checkFailed(host.hostname, "HAWQ configuration: expected '%s' for '%s', actual value is '%s'" % et)

    if not options.kerberos:
        # without --kerberos both sides must use simple authentication
        auth_key = 'hadoop.security.authentication'
        for config in (actual_config, hdfs_actual_config):
            if auth_key in config and config[auth_key] != 'simple':
                checkFailed(host.hostname, "HAWQ configuration: expected '%s' for '%s', actual value is '%s'" % ('simple', auth_key, config[auth_key]))

    if options.yarn or options.yarn_ha:
        hawq_yarn_property_exist_list = ['hawq_rm_yarn_address', 'hawq_rm_yarn_scheduler_address', 'hawq_rm_yarn_app_name']
        for item in hawq_yarn_property_exist_list:
            # name the property that was actually checked
            if item not in actual_config:
                checkFailed(host.hostname, "HAWQ configuration: %s not defined" % item)
            elif not actual_config[item]:
                checkFailed(host.hostname, "HAWQ configuration: %s is empty" % item)

    if 'dfs.client.read.shortcircuit' not in actual_config:
        checkFailed(host.hostname, "HAWQ configuration dfs.client.read.shortcircuit not defined")

    if 'dfs.client.read.shortcircuit' not in hdfs_actual_config:
        checkFailed(host.hostname, "HDFS configuration dfs.client.read.shortcircuit not defined")

    if 'dfs.domain.socket.path' not in actual_config:
        checkFailed(host.hostname, "HAWQ configuration dfs.domain.socket.path not defined")

    if 'dfs.domain.socket.path' not in hdfs_actual_config:
        checkFailed(host.hostname, "HDFS configuration dfs.domain.socket.path not defined")

    if is_datanode and 'dfs.domain.socket.path' in actual_config and 'dfs.domain.socket.path' in hdfs_actual_config:
        if actual_config['dfs.domain.socket.path'] != hdfs_actual_config['dfs.domain.socket.path']:
            checkFailed(host.hostname, "HAWQ configuration: dfs.domain.socket.path expect to have the same value with HDFS configuration")
        else:
            cmd = "ls -l %s" % actual_config['dfs.domain.socket.path']
            (result, output, errmsg) = remote_ssh_output(cmd, host.hostname, '')
            if result == 0:
                # characters 7-8 of the mode string are the r/w bits for "others"
                if output.split(' ')[0][7:9] != 'rw':
                    checkFailed(host.hostname, "HAWQ configuration dfs.domain.socket.path: %s should have R/W access for both hawq and HDFS on %s" % (actual_config['dfs.domain.socket.path'], host.hostname))
            else:
                checkFailed(host.hostname, "HAWQ configuration dfs.domain.socket.path: %s, does not exist on %s" % (actual_config['dfs.domain.socket.path'], host.hostname))

    replace_key = 'output.replace-datanode-on-failure'
    if replace_key not in actual_config:
        checkFailed(host.hostname, "HAWQ configuration: output.replace-datanode-on-failure not defined")
    elif len(datanode_list) > 0:
        # the expected value depends on the cluster size; without a datanode
        # list the value cannot be judged
        if len(datanode_list) < 4:
            if actual_config[replace_key] == 'true':
                checkFailed(host.hostname, "HAWQ configuration: output.replace-datanode-on-failure expect false, current is true")
        else:
            if actual_config[replace_key] == 'false':
                checkFailed(host.hostname, "HAWQ configuration: output.replace-datanode-on-failure expect true, current is false")


def testDiskCapacity(host):
    """Report every monitored mount on `host` that is fuller than allowed.

    A mount is examined when gpcheck_config.diskusage_mounts is empty or
    lists it; it fails when its usage is above gpcheck_config.diskusage_usagemax.
    """
    if options.verbose:
        logger.info("-- test Disk Capacity")

    for entry in host.data.diskusage.lines:
        watched = gpcheck_config.diskusage_mounts
        if len(watched) != 0 and entry.mount not in watched:
            continue # mount is not in the monitored list

        # the last character of used_percent is dropped before parsing
        # (presumably a trailing '%' sign)
        percent_used = int(entry.used_percent[:-1])
        if percent_used > gpcheck_config.diskusage_usagemax:
            problem = "potential disk full risk: %s mounted on %s has used %s space" % (
                entry.fs, entry.mount, entry.used_percent)
            checkFailed(host.hostname, problem)


def testHDFSConfig(host):
    """Validate the HDFS (and, when enabled, YARN) settings collected from `host`.

    Every problem is reported through checkFailed(); nothing is returned.
    The checks run in this order:
      * host memory against the java max heap size
      * each expected property against the collected site configuration
      * namenode / datanode heap size against the expected minimum
      * existence of the YARN nodemanager directories (YARN on a datanode only)
      * presence of the resource manager properties (YARN only)
      * presence of the YARN kerberos properties (YARN with kerberos only)

    The test is skipped when no HDFS data was collected for the host, and
    stops after one failure when the collection itself recorded an error.
    """
    hdfs = host.data.hdfs
    if hdfs is None:
        return # skip HDFS test when hdfs is None

    if options.verbose:
        logger.info("-- test HDFS config")

    if hdfs.errormsg:
        checkFailed(host.hostname, "collect HDFS configuration error: %s" % hdfs.errormsg)
        return

    yarn_enabled = bool(options.yarn or options.yarn_ha)

    # Merge into a copy: updating gpcheck_config.hdfs_expected in place would
    # leak the profile-specific and YARN keys into the shared configuration.
    expect_config = dict(gpcheck_config.hdfs_expected)

    # exactly one of the four HA / kerberos profiles applies
    if options.hdfs_ha and options.kerberos:
        expect_config.update(gpcheck_config.hdfs_ha_kerberos_expected)
    elif options.hdfs_ha:
        expect_config.update(gpcheck_config.hdfs_ha_expected)
    elif options.kerberos:
        expect_config.update(gpcheck_config.hdfs_kerberos_expected)
    else:
        expect_config.update(gpcheck_config.hdfs_non_expected)

    if yarn_enabled:
        expect_config.update(gpcheck_config.yarn_expected)
        if not options.yarn_ha and not options.kerberos:
            expect_config.update(gpcheck_config.yarn_non_expected)

        if options.yarn_ha:
            expect_config.update(gpcheck_config.yarn_ha_expected)
        if options.kerberos:
            expect_config.update(gpcheck_config.yarn_kerberos_expected)

    actual_config = hdfs.site_config
    actual_heap_size = hdfs.namenode_heap_size if host.is_namenode else hdfs.datanode_heap_size

    if host.data.machine.memory_in_MB < actual_heap_size:
        checkFailed(host.hostname, "host memory size '%s' is less than the java max heap size '%s'" % (host.data.machine.memory_in_MB, actual_heap_size))

    # test hdfs_site.xml setting
    for exp_key, exp_val in expect_config.items():
        if exp_key.startswith("dfs.mem"):
            continue # these options belong to the memory tests below

        if exp_key not in actual_config:
            checkFailed(host.hostname, "HDFS configuration missing: '%s' needs to be set to '%s'" % (exp_key, exp_val))
            continue

        actual_val = actual_config[exp_key]
        et = (exp_key, exp_val, actual_val)

        if exp_key == "dfs.block.local-path-access.user":
            # the actual value is a comma separated user list
            if exp_val not in actual_val.split(','):
                checkFailed(host.hostname, "HDFS configuration: '%s' should include user '%s', actual value is '%s'" % et)

        elif exp_key == "dfs.namenode.handler.count":
            # the expected value is a lower bound
            if int(exp_val) > int(actual_val):
                checkFailed(host.hostname, "HDFS configuration: '%s' should be at least '%s', actual value is '%s'" % et)

        elif exp_val != actual_val:
            checkFailed(host.hostname, "HDFS configuration: expected '%s' for '%s', actual value is '%s'" % et)

    # test hadoop memory setting
    expect_namenode_heap = expect_config["dfs.mem.namenode.heap"]
    expect_datanode_heap = expect_config["dfs.mem.datanode.heap"]

    if host.is_namenode and actual_heap_size < expect_namenode_heap:
        checkFailed(host.hostname, "Namenode Java heap size is only %sM, we recommends at least %sM" %
                                  (actual_heap_size, expect_namenode_heap))

    if not host.is_namenode and actual_heap_size < expect_datanode_heap:
        checkFailed(host.hostname, "Datanode Java heap size is only %sM, expect value is %sM" %
                                  (actual_heap_size, expect_datanode_heap))

    # Check that the nodemanager directories exist
    directory_check_list = []
    datanode_list = list()
    if HADOOP_HOME:
        datanode_list = parse_host_list_file("%s/etc/hadoop/slaves" % HADOOP_HOME)
    is_datanode = host.hostname in datanode_list

    if yarn_enabled and is_datanode:
        if 'yarn.nodemanager.local-dirs' in actual_config:
            directory_check_list += actual_config['yarn.nodemanager.local-dirs'].split(',')
        else:
            checkFailed(host.hostname, "YARN configuration: yarn.nodemanager.local-dirs not defined")

        if 'yarn.nodemanager.log-dirs' in actual_config:
            directory_check_list += actual_config['yarn.nodemanager.log-dirs'].split(',')
        else:
            checkFailed(host.hostname, "YARN configuration: yarn.nodemanager.log-dirs not defined")

    for directory in directory_check_list:
        cmd = "test -e %s" % directory
        (result, output, errmsg) = remote_ssh_output(cmd, host.hostname, '')
        if result != 0:
            checkFailed(host.hostname, "YARN nodemanager directory %s does not exist" % directory)

    # Check that the resource manager properties exist; the HA list wins when
    # both --yarn and --yarn-ha style options are set
    yarn_property_exist_list = []
    if options.yarn:
        yarn_property_exist_list = ['yarn.resourcemanager.address', 'yarn.resourcemanager.scheduler.address']

    if options.yarn_ha:
        yarn_property_exist_list = ['yarn.resourcemanager.hostname.rm1', 'yarn.resourcemanager.hostname.rm2']

    if yarn_enabled:
        for item in yarn_property_exist_list:
            if item not in actual_config:
                checkFailed(host.hostname, "YARN configuration: %s not defined" % item)
            elif not actual_config[item]:
                checkFailed(host.hostname, "YARN configuration: %s is empty" % item)

    # Check yarn kerberos properties
    if yarn_enabled and options.kerberos:
        yarn_kerberos_check_list = ['yarn.nodemanager.keytab', 'yarn.nodemanager.principal',
                                    'yarn.resourcemanager.keytab', 'yarn.resourcemanager.principal']
        for item in yarn_kerberos_check_list:
            if item not in actual_config:
                checkFailed(host.hostname, "YARN configuration missing: %s" % item)
            elif not actual_config[item]:
                checkFailed(host.hostname, "YARN configuration: %s is empty, expected non-empty" % item)


def testIOSchedulers(host):
    """Report every device on `host` whose IO scheduler is not 'deadline'.

    When the scheduler data collection recorded an error, that error is
    reported instead and no device is examined.
    """
    if options.verbose:
        logger.info("-- test IO scheduler")

    collect_error = host.data.ioschedulers.errormsg
    if collect_error:
        checkFailed(host.hostname, "collect IO scheduler data error: %s" % collect_error)
        return

    expectedScheduler = "deadline"
    devices = host.data.ioschedulers.devices
    for device in devices:
        current = devices[device]
        if current != expectedScheduler:
            problem = "on device (%s) IO scheduler '%s' does not match expected value '%s'" % (
                device, current, expectedScheduler)
            checkFailed(host.hostname, problem)


# this test is only performed when gpcheck runs as root
def testBlockdev(host):
    """Report every block device on `host` whose readahead value is not "16384".

    Nothing is checked when no blockdev data was collected for the host.
    """
    blockdev = host.data.blockdev
    if blockdev is None:
        return

    if options.verbose:
        logger.info("-- test block device readahead value")

    expectedReadAhead = "16384"
    for device in host.data.blockdev.ra:
        readahead = host.data.blockdev.ra[device]
        if readahead != expectedReadAhead:
            problem = "on device (%s) blockdev readahead value '%s' does not match expected value '%s'" % (
                device, readahead, expectedReadAhead)
            checkFailed(host.hostname, problem)


def testSysctl(host):
    """Compare the sysctl values collected from `host` with gpcheck_config.sysctl_expected.

    Every mismatch is reported through checkFailed().  An expected key that
    was not collected from the host is reported as a failure as well, instead
    of aborting the whole run with a KeyError.
    """
    if options.verbose:
        logger.info("-- test sysctl value")

    if host.data.sysctl.errormsg:
        checkFailed(host.hostname, "collect sysctl params error: %s" % host.data.sysctl.errormsg)
        return

    expected_values = gpcheck_config.sysctl_expected
    real_values = host.data.sysctl.variables

    # Params named in this set are treated as lower bounds: the actual value
    # may be larger than the expected one.  The set is currently empty, so
    # every param has to match its expected value exactly.
    params_with_lowerbound = set()

    for k in expected_values:
        if k not in real_values:
            checkFailed(host.hostname,
                    "sysctl value for key '%s' not found, but we expect '%s'" % (k, expected_values[k]))
            continue

        if k in params_with_lowerbound:
            if int(real_values[k]) < int(expected_values[k]):
                checkFailed(host.hostname,
                        "sysctl value for key '%s' has value '%s', but we expect at least '%s'" % (k, real_values[k], expected_values[k]))

        elif real_values[k] != expected_values[k]: # for other params, we expect the actual value to be the same value
            checkFailed(host.hostname,
                    "sysctl value for key '%s' has value '%s' and expects '%s'" % (k, real_values[k], expected_values[k]))


def testLimitsConf(host):
    """Check the limits.conf entries collected from `host` against gpcheck_config.limits_expected.

    Only entries whose domain is "gpadmin" or "*" are considered.  For each
    expected (type, item) pair the actual value must be at least the expected
    one; a pair that is absent from the collected data is reported too.

    limits.conf accepts "unlimited", "infinity" and "-1" as "no limit"; those
    always satisfy a minimum and are accepted.  Any other value that is not
    an integer is reported as a failure instead of raising ValueError.
    """
    if options.verbose:
        logger.info("-- test /etc/security/limits.conf")

    if host.data.limitsconf.errormsg:
        checkFailed(host.hostname, "collect limits.conf data error: %s" % host.data.limitsconf.errormsg)
        return

    # both dict has the form: (type, item) => value
    expect_data = gpcheck_config.limits_expected
    actual_data = dict([((e.type, e.item), e.value) for e in host.data.limitsconf.lines if e.domain in ("gpadmin", "*")])
    expect_keyset = set(expect_data.keys())
    actual_keyset = set(actual_data.keys())

    no_limit_values = ("unlimited", "infinity", "-1")

    for key in expect_keyset.intersection(actual_keyset):
        expect_val = int(expect_data[key])
        raw_actual = str(actual_data[key]).strip()

        if raw_actual in no_limit_values:
            continue # no limit at all is never lower than the expected minimum

        try:
            actual_val = int(raw_actual)
        except ValueError:
            checkFailed(host.hostname,
                       "%s in /etc/security/limits.conf has non-numeric value '%s', expected at least %d" % (
                            " ".join(key), raw_actual, expect_val))
            continue

        if actual_val < expect_val:
            checkFailed(host.hostname,
                       "%s in  /etc/security/limits.conf has value %d lower than expected value %d" % (
                            " ".join(key), actual_val, expect_val))

    for key in expect_keyset.difference(actual_keyset):
        checkFailed(host.hostname,
                   "%s not found in /etc/security/limits.conf" % " ".join(key))


def testLinuxMounts(host):
    """Report every mount point required by gpcheck_config.mount_points that `host` lacks.

    The test is a no-op (with a verbose-only notice) when no mount points
    are configured.
    """
    if options.verbose:
        logger.info("-- test mount points")

    required = gpcheck_config.mount_points
    mounted = set(entry.dir for entry in host.data.mounts.entries.values())

    if len(required) == 0:
        if options.verbose:
            logger.info("-- you didn't specify any mount points to be check in %s, ignore this test" % GPCHECK_CONFIG_FILE)
        return

    # the difference is empty when every required mount point is present
    for missing in required.difference(mounted):
        checkFailed(host.hostname, "%s is not mounted" % missing)


def testNtp(host):
    """Check that the clock of `host` agrees with the gpcheck collection window.

    The time recorded on the host must fall between the collection start and
    end times, with one unit of slack on each side (the values are passed to
    time.ctime(), so presumably seconds).  A host on which ntpd was not
    detected is reported as well.
    """
    if options.verbose:
        logger.info("-- test NTP")

    ntp = host.data.ntp

    if ntp.currenttime < (gpcheck_info.collection_start_time - 1):
        checkFailed(host.hostname, "potential NTPD issue.  gpcheck start time (%s) time on machine (%s)" % (time.ctime(gpcheck_info.collection_start_time), time.ctime(ntp.currenttime)))
    if ntp.currenttime > (gpcheck_info.collection_end_time + 1):
        # report the end time here: this is the bound that was exceeded
        checkFailed(host.hostname, "potential NTPD issue.  gpcheck end time (%s) time on machine (%s)" % (time.ctime(gpcheck_info.collection_end_time), time.ctime(ntp.currenttime)))
    if not ntp.running:
        checkFailed(host.hostname, "ntpd not detected on machine")


def testGenericLinuxHost(host):
    """Run the per-host Linux checks on `host`.

    Namenodes and other hosts run the same checks except that the IO
    scheduler test is only run on hosts that are not namenodes; the order of
    the disk capacity and HDFS tests also differs between the two.
    """
    logger.info("test on host: %s" % host.hostname)

    if host.is_namenode:
        checks = (testHAWQGUC,
                  testHAWQconfig,
                  testHDFSConfig,
                  testDiskCapacity,
                  testSysctl,
                  testLimitsConf,
                  testLinuxMounts,
                  testNtp)
    else:
        checks = (testHAWQGUC,
                  testHAWQconfig,
                  testDiskCapacity,
                  testHDFSConfig,
                  testIOSchedulers,
                  testSysctl,
                  testLimitsConf,
                  testLinuxMounts,
                  testNtp)

    for check in checks:
        check(host)


def testGenericSolarisHost(host):
    """Run the three Solaris per-host checks on `host`, in the order listed below.

    NOTE(review): the testSolarisEtc* helpers are not defined in the visible
    part of this file -- confirm they exist before relying on this path.
    """
    testSolarisEtcSystem(host)
    testSolarisEtcProject(host)
    testSolarisEtcUserAttr(host)

def testUnameConsistency():
    """Report every host whose collected uname output differs from the baseline.

    The baseline is the first host (in gpcheck_info.hosts iteration order)
    with a non-empty uname output; each later host that differs from it is
    reported through checkFailed() under its own hostname.
    """
    logger.info("test uname consistency")
    firstUname = None
    firstHost = None
    for host in gpcheck_info.hosts.values():
        uname = host.data.uname.output
        if not firstUname:
            # no baseline yet: remember this host's output
            firstUname = uname
            firstHost = host.hostname
        elif firstUname != uname:
            checkFailed(host.hostname, "uname -r output different among hosts: %s : %s != %s : %s" % (firstHost, firstUname, host.hostname, uname))


def testGenericLinuxCluster():
    """Run the Linux per-host checks on every known host, then compare uname output across hosts."""
    for host in gpcheck_info.hosts.values():
        testGenericLinuxHost(host)

    testUnameConsistency()

def testGenericLinuxClusterBlockDev():
    """Run the block device readahead check on every host that is not a namenode."""
    for host in gpcheck_info.hosts.values():
        if host.is_namenode:
            continue
        testBlockdev(host)

def testGenericSolarisCluster():
    """Run the Solaris per-host checks on every known host, then compare uname output across hosts."""
    for host in gpcheck_info.hosts.values():
        testGenericSolarisHost(host)

    testUnameConsistency()


def runTests():
    """Run the test suite matching the detected platform and log a summary.

    The block device tests are added on Linux when gpcheck runs as root.
    Raises GpCheckError when the platform is neither generic Linux nor
    generic Solaris.
    """
    platform = gpcheck_info.host_type

    if platform == HostType.GPCHECK_HOSTTYPE_GENERIC_LINUX:
        testGenericLinuxCluster()
        if gpcheck_info.is_root:
            testGenericLinuxClusterBlockDev()
    elif platform == HostType.GPCHECK_HOSTTYPE_GENERIC_SOLARIS:
        testGenericSolarisCluster()
    else:
        raise GpCheckError("No tests exist for this platform in gpcheck")

    # report checks result
    if found_errors:
        verdict = "check failed!\tfound %s error(s)" % found_errors
    else:
        verdict = "all check succeed!"

    logger.info("GPCHECK Result:")
    logger.info("---------------------------------------")
    logger.info(verdict)
    logger.info("---------------------------------------")


def readZip():
    """Load gpcheck_info from the .tar.gz archive named by options.zipin.

    The archive is unpacked in the current directory, the extracted pickle
    file is moved into tmpdir and then loaded into the global gpcheck_info.

    The extracted file is expected to be named like the archive without its
    ".tar.gz" suffix and to land in the current directory (this is what
    doZip() produces), so only the base name of options.zipin is used to
    locate it; the archive itself may live in any directory.

    Raises GpCheckError when options.zipin is not a .tar.gz file or when
    extracting, moving or unpickling fails.
    """
    global gpcheck_info

    def runLocalCommand(name, cmdStr):
        # queue the command on the shared worker pool and fail on any bad result
        pool.addCommand(Command(name, cmdStr))
        pool.join()
        for item in pool.getCompletedItems():
            if item.results.rc or item.results.halt or not item.results.completed:
                raise Exception("error running command '%s'" % cmdStr)

    logger.info("trying to read zip file '%s'..." % options.zipin)

    suffix = ".tar.gz"
    archive_name = os.path.basename(options.zipin)
    # the name has to end with the suffix and have something in front of it
    if not archive_name.endswith(suffix) or len(archive_name) == len(suffix):
        raise GpCheckError("--zipin file needs to be a .tar.gz file")
    fname = archive_name[:-len(suffix)]

    # untar
    cmdStr = "tar xfz %s" % (options.zipin)
    if options.verbose:
        logger.info(cmdStr)
    try:
        runLocalCommand("tarcmd", cmdStr)
    except Exception as e:
        raise GpCheckError("Failed to extract tar file '%s': %s" % (options.zipin, e))

    # move extracted file to temp directory
    newfname = "%s/%s" % (tmpdir, fname)
    cmdStr = "mv %s %s" % (fname, newfname)
    if options.verbose:
        logger.info(cmdStr)
    try:
        runLocalCommand("mvcmd", cmdStr)
    except Exception as e:
        raise GpCheckError("Failed to move file '%s' to temp directory: %s" % (fname, e))

    # load pickle file
    # NOTE: unpickling can execute arbitrary code -- only feed archives that
    # were produced by a trusted gpcheck run.
    try:
        with open(newfname, "rb") as f:
            gpcheck_info = pickle.load(f)
    except Exception as e:
        raise GpCheckError("Failed to load pickle file '%s': %s" % (newfname, e))

    logger.info("trying to read zip file '%s' [success]" % options.zipin)


def doZip(fname):
    """Pickle gpcheck_info into `fname`, pack it as `fname`.tar.gz and delete the pickle.

    Raises GpCheckError when the pickle cannot be written, or when the tar
    or rm command does not complete successfully.
    """
    def runChecked(label, cmdStr):
        # queue the command on the shared worker pool and fail on any bad result
        cmd = Command(label, cmdStr)
        pool.addCommand(cmd)
        pool.join()
        for done in pool.getCompletedItems():
            if done.results.rc or done.results.halt or not done.results.completed:
                raise Exception("error running command '%s': %s" % (cmdStr, done.results.stderr.strip()))

    logger.info("dump gpcheck data into a zip file '%s.tar.gz'..." % fname)

    # dump to pickle file
    try:
        with open(fname, "wb") as pickle_file:
            pickle.dump(gpcheck_info, pickle_file)
    except Exception as e:
        raise GpCheckError("Failed to dump pickle file '%s':\n%s" % (fname, e))

    # make a tar ball
    tar_cmd = "tar cfz %s.tar.gz %s" % (fname, fname)
    if options.verbose:
        logger.info(tar_cmd)
    try:
        runChecked("tarcmd", tar_cmd)
    except Exception as e:
        raise GpCheckError("Failed to dump gpcheck data into a zip file:\n%s" % e)

    # delete pickle file
    rm_cmd = "rm -rf %s" % fname
    if options.verbose:
        logger.info(rm_cmd)
    try:
        runChecked("rmcmd", rm_cmd)
    except Exception as e:
        raise GpCheckError("Failed to delete pickle file '%s':\n%s" % (fname, e))

    logger.info("dump gpcheck data into a zip file '%s.tar.gz' [success]" % fname)


def doPrint():
    """Print the collected data of every host (sorted by name) to stdout.

    The HAWQ guc settings follow when gpcheck_info.hawq_collected_ok is set.
    """
    for hostname in sorted(gpcheck_info.hosts):
        print("HOST: %s" % hostname)
        print(gpcheck_info.hosts[hostname].data)
        print("----------------------------------------------------------------------\n")

    if not gpcheck_info.hawq_collected_ok:
        return

    print("HAWQ guc settings:")
    for guc_name, guc_val in gpcheck_info.hawq_gucs.items():
        # guc_val holds the master value first and the segment value second
        print("GUC          : %s\nMaster  value: %s\nSegment value: %s\n" % (guc_name, guc_val[0], guc_val[1]))


# Script entry point: validate the environment, collect the data (from a zip
# file or from the hosts), then print it, dump it to a zip file or run the
# tests.  Exits with status 1 on any GpCheckError or when a test found errors.
if __name__ == '__main__':

    if gpcheck_info.is_root:
        logger.info("gpcheck will perform block device's readahead checks when run as root")
    try:
        try:
            checkPlatform()
            parseargs()
            readConfigFile()

        except GpCheckError as e:
            # the outer finally shuts the worker pool down on the way out
            logger.error(str(e))
            sys.exit(1)

        try:
            tmpdir = tempfile.mkdtemp(prefix='gpcheck')
        except Exception as e:
            logger.error("Error creating tmp dir on master: %s" % e)
            sys.exit(1)

        try:
            # Phase 1: collect input
            if options.zipin:
                readZip() # load information into gpcheck_info from zip
            else:
                # read host info into gpcheck_info.hosts from --file or --host
                createHostList()
                # collect each server's system environment configuration
                runCollections()
                # read collected data into gpcheck_info
                readDataFiles()
                # read HAWQ configuration
                readHAWQConfiguration()

            # Phase 2: generate output
            if options.stdout:
                doPrint()
            elif options.zipout:
                doZip("./gpcheck_%s" % time.time())
            else:
                runTests()
                if found_errors:
                    sys.exit(1)

        except GpCheckError as e:
            logger.error(str(e))
            sys.exit(1)

        finally:
            # always remove the temp directory, even when exiting through sys.exit()
            logger.info("Clean up...")
            try:
                if tmpdir:
                    shutil.rmtree(tmpdir)
            except Exception as e:
                logger.error("error removing tempdir during job cleanup: %s" % e)
    finally:
        # stop the worker pool on every exit path
        if pool:
            pool.join()
            pool.haltWork()
            pool.joinWorkers()
