author     jadmanski <jadmanski@592f7852-d20e-0410-864c-8624ca9c26a4>  2009-03-25 20:07:10 +0000
committer  jadmanski <jadmanski@592f7852-d20e-0410-864c-8624ca9c26a4>  2009-03-25 20:07:10 +0000
commit     9b5ace430a54607f50fe694abf9eceaeffc626b3 (patch)
tree       5f232ae06f4e94cf73a58ee108ac958a4d9f4d03 /server
parent     f6a98e7ae84948de502948bd360b330c41046e5b (diff)
Add support for running autoserv with a "--collect-crashinfo" flag
that tells autotest to run a job consisting of ONLY the crashinfo
collection. This will pull back crashinfo and, if run against the
results directory of a crashed job, will also pull back any client
results it can find on the remote host.
Risk: Low
Visibility: Adds a new mode to autoserv for just doing crashinfo
collection.
Signed-off-by: John Admanski <jadmanski@google.com>
git-svn-id: svn://test.kernel.org/autotest/trunk@2933 592f7852-d20e-0410-864c-8624ca9c26a4
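With this change, crashinfo collection can be re-run by hand against the
results directory of a crashed job. A hypothetical invocation (the host name
and results path are made up; -m and -r are autoserv's usual machine and
results-directory options):

    autoserv --collect-crashinfo -m somehost -r results/crashed-job

As the autoserv hunk below shows, the "results directory already exists"
check is skipped in this mode, since reusing the old directory is the point.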
Diffstat (limited to 'server')
-rwxr-xr-x  server/autoserv                10
-rw-r--r--  server/autoserv_parser.py       3
-rw-r--r--  server/hosts/remote.py          39
-rwxr-xr-x  server/server_job.py            53
-rw-r--r--  server/server_job_unittest.py    1
5 files changed, 67 insertions, 39 deletions
diff --git a/server/autoserv b/server/autoserv
index 0f228661..00fd5f44 100755
--- a/server/autoserv
+++ b/server/autoserv
@@ -61,13 +61,15 @@ def run_autoserv(pid_file_manager, results, parser):
     ssh_user = parser.options.ssh_user
     ssh_port = parser.options.ssh_port
     ssh_pass = parser.options.ssh_pass
+    collect_crashinfo = parser.options.collect_crashinfo

     # can't be both a client and a server side test
     if client and server:
         print "Can not specify a test as both server and client!"
         sys.exit(1)

-    if len(parser.args) < 1 and not (verify or repair or cleanup):
+    if len(parser.args) < 1 and not (verify or repair or cleanup
+                                     or collect_crashinfo):
         print parser.parser.print_help()
         sys.exit(1)
@@ -121,7 +123,8 @@ def run_autoserv(pid_file_manager, results, parser):
             job.verify()
         else:
             try:
-                job.run(cleanup, install_before, install_after)
+                job.run(cleanup, install_before, install_after,
+                        only_collect_crashinfo=collect_crashinfo)
             finally:
                 while job.hosts:
                     host = job.hosts.pop()
@@ -152,7 +155,8 @@ def main():
     if not results:
         results = 'results.' + time.strftime('%Y-%m-%d-%H.%M.%S')
     results = os.path.abspath(results)
-    if os.path.exists(os.path.join(results, 'control.srv')):
+    resultdir_exists = os.path.exists(os.path.join(results, 'control.srv'))
+    if not parser.options.collect_crashinfo and resultdir_exists:
         error = "Error: results directory already exists: %s\n" % results
         sys.stderr.write(error)
         sys.exit(1)
diff --git a/server/autoserv_parser.py b/server/autoserv_parser.py
index 7fe8fa85..5e43e792 100644
--- a/server/autoserv_parser.py
+++ b/server/autoserv_parser.py
@@ -100,6 +100,9 @@ class base_autoserv_parser(object):
                                dest="install_in_tmpdir", default=False,
                                help=("by default install autotest clients in "
                                      "a temporary directory"))
+        self.parser.add_option("--collect-crashinfo", action="store_true",
+                               dest="collect_crashinfo", default=False,
+                               help="just run crashinfo collection")


    def parse_args(self):
diff --git a/server/hosts/remote.py b/server/hosts/remote.py
index b0fb8bc9..2c4a79f1 100644
--- a/server/hosts/remote.py
+++ b/server/hosts/remote.py
@@ -1,7 +1,7 @@
 """This class defines the Remote host class, mixing in the
 SiteHost class if it is available."""

-import os, time
+import os, time, pickle, logging
 from autotest_lib.client.common_lib import error
 from autotest_lib.server import utils, profiler
 from autotest_lib.server.hosts import base_classes, bootloader
@@ -201,18 +201,19 @@ class RemoteHost(base_classes.Host):


     def get_crashinfo(self, test_start_time):
-        print "Collecting crash information..."
+        logging.info("Collecting crash information...")
         super(RemoteHost, self).get_crashinfo(test_start_time)

         # wait for four hours, to see if the machine comes back up
         current_time = time.strftime("%b %d %H:%M:%S", time.localtime())
-        print "Waiting four hours for %s to come up (%s)" % (self.hostname,
-                                                             current_time)
+        logging.info("Waiting four hours for %s to come up (%s)",
+                     self.hostname, current_time)
         if not self.wait_up(timeout=4*60*60):
-            print "%s down, unable to collect crash info" % self.hostname
+            logging.warning("%s down, unable to collect crash info",
+                            self.hostname)
             return
         else:
-            print "%s is back up, collecting crash info" % self.hostname
+            logging.info("%s is back up, collecting crash info", self.hostname)

         # find a directory to put the crashinfo into
         if self.job:
@@ -226,26 +227,26 @@ class RemoteHost(base_classes.Host):
         # collect various log files
         log_files = ["/var/log/messages", "/var/log/monitor-ssh-reboots"]
         for log in log_files:
-            print "Collecting %s..." % log
+            logging.info("Collecting %s...", log)
             try:
                 self.get_file(log, infodir)
             except Exception:
-                print "Collection of %s failed. Non-fatal, continuing." % log
+                logging.warning("Collection of %s failed", log)

         # collect dmesg
-        print "Collecting dmesg (saved to crashinfo/dmesg)..."
+        logging.info("Collecting dmesg (saved to crashinfo/dmesg)...")
         devnull = open("/dev/null", "w")
         try:
             try:
                 result = self.run("dmesg", stdout_tee=devnull).stdout
                 file(os.path.join(infodir, "dmesg"), "w").write(result)
             except Exception, e:
-                print "crashinfo collection of dmesg failed with:\n%s" % e
+                logging.warning("Collection of dmesg failed:\n%s", e)
         finally:
             devnull.close()

         # collect any profiler data we can find
-        print "Collecting any server-side profiler data lying around..."
+        logging.info("Collecting any server-side profiler data lying around...")
         try:
             cmd = "ls %s" % profiler.PROFILER_TMPDIR
             profiler_dirs = [path for path in self.run(cmd).stdout.split()
@@ -260,7 +261,21 @@ class RemoteHost(base_classes.Host):
                 os.mkdir(local_path)
                 self.get_file(remote_path + "/", local_path)
         except Exception, e:
-            print "crashinfo collection of profiler data failed with:\n%s" % e
+            logging.warning("Collection of profiler data failed with:\n%s", e)
+
+
+        # collect any uncollected logs we see (for this host)
+        if self.job and os.path.exists(self.job.uncollected_log_file):
+            try:
+                logs = pickle.load(open(self.job.uncollected_log_file))
+                for hostname, remote_path, local_path in logs:
+                    if hostname == self.hostname:
+                        logging.info("Retrieving logs from %s:%s into %s",
+                                     hostname, remote_path, local_path)
+                        self.get_file(remote_path + "/", local_path + "/")
+            except Exception, e:
+                logging.warning("Error while trying to collect stranded "
+                                "Autotest client logs: %s", e)


     def are_wait_up_processes_up(self):
diff --git a/server/server_job.py b/server/server_job.py
index 98e4ed19..05bd19fb 100755
--- a/server/server_job.py
+++ b/server/server_job.py
@@ -104,9 +104,10 @@ class base_server_job(object):
         if resultdir:
             if not os.path.exists(resultdir):
                 os.mkdir(resultdir)
-            log_file = open(self.uncollected_log_file, "w")
-            pickle.dump([], log_file)
-            log_file.close()
+            if not os.path.exists(self.uncollected_log_file):
+                log_file = open(self.uncollected_log_file, "w")
+                pickle.dump([], log_file)
+                log_file.close()
             if not os.path.exists(self.debugdir):
                 os.mkdir(self.debugdir)
             status_log = self.get_status_log_path()
@@ -360,7 +361,7 @@ class base_server_job(object):
     USE_TEMP_DIR = object()
     def run(self, cleanup=False, install_before=False, install_after=False,
             collect_crashdumps=True, namespace={}, control=None,
-            control_file_dir=None):
+            control_file_dir=None, only_collect_crashinfo=False):
         # use a copy so changes don't affect the original dictionary
         namespace = namespace.copy()
         machines = self.machines
@@ -390,27 +391,31 @@ class base_server_job(object):
             if install_before and machines:
                 self._execute_code(INSTALL_CONTROL_FILE, namespace)

-            # determine the dir to write the control files to
-            if control_file_dir and control_file_dir is not self.USE_TEMP_DIR:
-                temp_control_file_dir = None
-            else:
-                temp_control_file_dir = control_file_dir = tempfile.mkdtemp(
-                    suffix='temp_control_file_dir')
-            server_control_file = os.path.join(control_file_dir,
-                                               SERVER_CONTROL_FILENAME)
-            client_control_file = os.path.join(control_file_dir,
-                                               CLIENT_CONTROL_FILENAME)
-            if self.client:
-                namespace['control'] = control
-                utils.open_write_close(client_control_file, control)
-                shutil.copy(CLIENT_WRAPPER_CONTROL_FILE, server_control_file)
-            else:
-                namespace['utils'] = utils
-                utils.open_write_close(server_control_file, control)
-            self._execute_code(server_control_file, namespace)
+            if not only_collect_crashinfo:
+                # determine the dir to write the control files to
+                cfd_specified = (control_file_dir
+                                 and control_file_dir is not self.USE_TEMP_DIR)
+                if cfd_specified:
+                    temp_control_file_dir = None
+                else:
+                    temp_control_file_dir = tempfile.mkdtemp(
+                        suffix='temp_control_file_dir')
+                    control_file_dir = temp_control_file_dir
+                server_control_file = os.path.join(control_file_dir,
+                                                   SERVER_CONTROL_FILENAME)
+                client_control_file = os.path.join(control_file_dir,
+                                                   CLIENT_CONTROL_FILENAME)
+                if self.client:
+                    namespace['control'] = control
+                    utils.open_write_close(client_control_file, control)
+                    shutil.copy(CLIENT_WRAPPER_CONTROL_FILE,
+                                server_control_file)
+                else:
+                    utils.open_write_close(server_control_file, control)
+                self._execute_code(server_control_file, namespace)

-            # disable crashinfo collection if we get this far without error
-            collect_crashinfo = False
+                # no error occured, so we don't need to collect crashinfo
+                collect_crashinfo = False
         finally:
             if temp_control_file_dir:
                 # Clean up temp directory used for copies of the control files
diff --git a/server/server_job_unittest.py b/server/server_job_unittest.py
index 4411b7cf..a553d6a3 100644
--- a/server/server_job_unittest.py
+++ b/server/server_job_unittest.py
@@ -59,6 +59,7 @@ class CopyLogsTest(unittest.TestCase):
         os.path.exists.expect_call(
                 mock.is_string_comparator()).and_return(False)
         os.mkdir.expect_call(mock.is_string_comparator())
+        os.path.exists.expect_call(self.uncollected).and_return(False)
         server_job.open.expect_call(self.uncollected,
                                     'w').and_return(file_obj)
         pickle.dump.expect_call([], file_obj)
         file_obj.close.expect_call()
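For readers unfamiliar with the uncollected-logs mechanism the remote.py hunk
relies on: server_job seeds self.uncollected_log_file with an empty pickled
list, and each entry is a (hostname, remote_path, local_path) tuple that
get_crashinfo() replays for its own host. A minimal standalone sketch of that
file format (Python 2 to match the codebase; the file name, host name and
paths below are illustrative only, not Autotest's real values):

import pickle

uncollected_log_file = "uncollected_logs"   # hypothetical file name

# what server_job writes when the results directory is first set up
log_file = open(uncollected_log_file, "w")
pickle.dump([], log_file)
log_file.close()

# each entry records where a client's results live on the remote host and
# where they should be copied locally
logs = pickle.load(open(uncollected_log_file))
logs.append(("somehost", "/usr/local/autotest/results/default",
             "results/somehost"))
log_file = open(uncollected_log_file, "w")
pickle.dump(logs, log_file)
log_file.close()

# RemoteHost.get_crashinfo() replays only the entries for its own hostname
for hostname, remote_path, local_path in pickle.load(open(uncollected_log_file)):
    if hostname == "somehost":
        print "would fetch %s:%s into %s" % (hostname, remote_path, local_path)

Because server_job now only seeds this file when it does not already exist, a
--collect-crashinfo run over an old results directory sees whatever entries the
crashed job left behind, which is how the stranded client results get pulled
back.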