author     jadmanski <jadmanski@592f7852-d20e-0410-864c-8624ca9c26a4>  2009-03-25 20:07:10 +0000
committer  jadmanski <jadmanski@592f7852-d20e-0410-864c-8624ca9c26a4>  2009-03-25 20:07:10 +0000
commit     9b5ace430a54607f50fe694abf9eceaeffc626b3 (patch)
tree       5f232ae06f4e94cf73a58ee108ac958a4d9f4d03 /server
parent     f6a98e7ae84948de502948bd360b330c41046e5b (diff)
Add support for running autoserv with a "--collect-crashinfo" flag
that tells autotest to run a job consisting of ONLY the crashinfo
collection. This will pull back crashinfo and, if run against the
results directory of a crashed job, will also pull back any client
results it can find on the remote host.
Risk: Low
Visibility: Adds a new mode to autoserv for just doing crashinfo
collection.
Signed-off-by: John Admanski <jadmanski@google.com>
git-svn-id: svn://test.kernel.org/autotest/trunk@2933 592f7852-d20e-0410-864c-8624ca9c26a4
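With this change, crashinfo collection can be re-run by hand against the
results directory of a crashed job. A hypothetical invocation (the host name
and results path are made up; -m and -r are autoserv's usual machine and
results-directory options):

    autoserv --collect-crashinfo -m somehost -r results/crashed-job

As the autoserv hunk below shows, the "results directory already exists"
check is skipped in this mode, since reusing the old directory is the point.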
Diffstat (limited to 'server')
-rwxr-xr-x  server/autoserv                10
-rw-r--r--  server/autoserv_parser.py       3
-rw-r--r--  server/hosts/remote.py          39
-rwxr-xr-x  server/server_job.py            53
-rw-r--r--  server/server_job_unittest.py    1
5 files changed, 67 insertions, 39 deletions
diff --git a/server/autoserv b/server/autoserv
index 0f228661..00fd5f44 100755
--- a/server/autoserv
+++ b/server/autoserv
@@ -61,13 +61,15 @@ def run_autoserv(pid_file_manager, results, parser):
     ssh_user = parser.options.ssh_user
     ssh_port = parser.options.ssh_port
     ssh_pass = parser.options.ssh_pass
+    collect_crashinfo = parser.options.collect_crashinfo

     # can't be both a client and a server side test
     if client and server:
         print "Can not specify a test as both server and client!"
         sys.exit(1)

-    if len(parser.args) < 1 and not (verify or repair or cleanup):
+    if len(parser.args) < 1 and not (verify or repair or cleanup
+                                     or collect_crashinfo):
         print parser.parser.print_help()
         sys.exit(1)
@@ -121,7 +123,8 @@ def run_autoserv(pid_file_manager, results, parser):
             job.verify()
         else:
             try:
-                job.run(cleanup, install_before, install_after)
+                job.run(cleanup, install_before, install_after,
+                        only_collect_crashinfo=collect_crashinfo)
             finally:
                 while job.hosts:
                     host = job.hosts.pop()
@@ -152,7 +155,8 @@ def main():
     if not results:
         results = 'results.' + time.strftime('%Y-%m-%d-%H.%M.%S')
     results = os.path.abspath(results)
-    if os.path.exists(os.path.join(results, 'control.srv')):
+    resultdir_exists = os.path.exists(os.path.join(results, 'control.srv'))
+    if not parser.options.collect_crashinfo and resultdir_exists:
         error = "Error: results directory already exists: %s\n" % results
         sys.stderr.write(error)
         sys.exit(1)
diff --git a/server/autoserv_parser.py b/server/autoserv_parser.py
index 7fe8fa85..5e43e792 100644
--- a/server/autoserv_parser.py
+++ b/server/autoserv_parser.py
@@ -100,6 +100,9 @@ class base_autoserv_parser(object):
                                dest="install_in_tmpdir", default=False,
                                help=("by default install autotest clients in "
                                      "a temporary directory"))
+        self.parser.add_option("--collect-crashinfo", action="store_true",
+                               dest="collect_crashinfo", default=False,
+                               help="just run crashinfo collection")


    def parse_args(self):
diff --git a/server/hosts/remote.py b/server/hosts/remote.py
index b0fb8bc9..2c4a79f1 100644
--- a/server/hosts/remote.py
+++ b/server/hosts/remote.py
@@ -1,7 +1,7 @@
 """This class defines the Remote host class, mixing in the
 SiteHost class if it is available."""

-import os, time
+import os, time, pickle, logging
 from autotest_lib.client.common_lib import error
 from autotest_lib.server import utils, profiler
 from autotest_lib.server.hosts import base_classes, bootloader
@@ -201,18 +201,19 @@ class RemoteHost(base_classes.Host):


     def get_crashinfo(self, test_start_time):
-        print "Collecting crash information..."
+        logging.info("Collecting crash information...")
         super(RemoteHost, self).get_crashinfo(test_start_time)

         # wait for four hours, to see if the machine comes back up
         current_time = time.strftime("%b %d %H:%M:%S", time.localtime())
-        print "Waiting four hours for %s to come up (%s)" % (self.hostname,
-                                                             current_time)
+        logging.info("Waiting four hours for %s to come up (%s)",
+                     self.hostname, current_time)
         if not self.wait_up(timeout=4*60*60):
-            print "%s down, unable to collect crash info" % self.hostname
+            logging.warning("%s down, unable to collect crash info",
+                            self.hostname)
             return
         else:
-            print "%s is back up, collecting crash info" % self.hostname
+            logging.info("%s is back up, collecting crash info", self.hostname)

         # find a directory to put the crashinfo into
         if self.job:
@@ -226,26 +227,26 @@ class RemoteHost(base_classes.Host):
         # collect various log files
         log_files = ["/var/log/messages", "/var/log/monitor-ssh-reboots"]
         for log in log_files:
-            print "Collecting %s..." % log
+            logging.info("Collecting %s...", log)
             try:
                 self.get_file(log, infodir)
             except Exception:
-                print "Collection of %s failed. Non-fatal, continuing." % log
+                logging.warning("Collection of %s failed", log)

         # collect dmesg
-        print "Collecting dmesg (saved to crashinfo/dmesg)..."
+        logging.info("Collecting dmesg (saved to crashinfo/dmesg)...")
         devnull = open("/dev/null", "w")
         try:
             try:
                 result = self.run("dmesg", stdout_tee=devnull).stdout
                 file(os.path.join(infodir, "dmesg"), "w").write(result)
             except Exception, e:
-                print "crashinfo collection of dmesg failed with:\n%s" % e
+                logging.warning("Collection of dmesg failed:\n%s", e)
         finally:
             devnull.close()

         # collect any profiler data we can find
-        print "Collecting any server-side profiler data lying around..."
+        logging.info("Collecting any server-side profiler data lying around...")
         try:
             cmd = "ls %s" % profiler.PROFILER_TMPDIR
             profiler_dirs = [path for path in self.run(cmd).stdout.split()
@@ -260,7 +261,21 @@ class RemoteHost(base_classes.Host):
                 os.mkdir(local_path)
                 self.get_file(remote_path + "/", local_path)
         except Exception, e:
-            print "crashinfo collection of profiler data failed with:\n%s" % e
+            logging.warning("Collection of profiler data failed with:\n%s", e)
+
+
+        # collect any uncollected logs we see (for this host)
+        if self.job and os.path.exists(self.job.uncollected_log_file):
+            try:
+                logs = pickle.load(open(self.job.uncollected_log_file))
+                for hostname, remote_path, local_path in logs:
+                    if hostname == self.hostname:
+                        logging.info("Retrieving logs from %s:%s into %s",
+                                     hostname, remote_path, local_path)
+                        self.get_file(remote_path + "/", local_path + "/")
+            except Exception, e:
+                logging.warning("Error while trying to collect stranded "
+                                "Autotest client logs: %s", e)


     def are_wait_up_processes_up(self):
diff --git a/server/server_job.py b/server/server_job.py
index 98e4ed19..05bd19fb 100755
--- a/server/server_job.py
+++ b/server/server_job.py
@@ -104,9 +104,10 @@ class base_server_job(object):
         if resultdir:
             if not os.path.exists(resultdir):
                 os.mkdir(resultdir)
-            log_file = open(self.uncollected_log_file, "w")
-            pickle.dump([], log_file)
-            log_file.close()
+            if not os.path.exists(self.uncollected_log_file):
+                log_file = open(self.uncollected_log_file, "w")
+                pickle.dump([], log_file)
+                log_file.close()
             if not os.path.exists(self.debugdir):
                 os.mkdir(self.debugdir)
             status_log = self.get_status_log_path()
@@ -360,7 +361,7 @@ class base_server_job(object):
     USE_TEMP_DIR = object()
     def run(self, cleanup=False, install_before=False, install_after=False,
             collect_crashdumps=True, namespace={}, control=None,
-            control_file_dir=None):
+            control_file_dir=None, only_collect_crashinfo=False):
         # use a copy so changes don't affect the original dictionary
         namespace = namespace.copy()
         machines = self.machines
@@ -390,27 +391,31 @@ class base_server_job(object):
             if install_before and machines:
                 self._execute_code(INSTALL_CONTROL_FILE, namespace)

-            # determine the dir to write the control files to
-            if control_file_dir and control_file_dir is not self.USE_TEMP_DIR:
-                temp_control_file_dir = None
-            else:
-                temp_control_file_dir = control_file_dir = tempfile.mkdtemp(
-                    suffix='temp_control_file_dir')
-            server_control_file = os.path.join(control_file_dir,
-                                               SERVER_CONTROL_FILENAME)
-            client_control_file = os.path.join(control_file_dir,
-                                               CLIENT_CONTROL_FILENAME)
-            if self.client:
-                namespace['control'] = control
-                utils.open_write_close(client_control_file, control)
-                shutil.copy(CLIENT_WRAPPER_CONTROL_FILE, server_control_file)
-            else:
-                namespace['utils'] = utils
-                utils.open_write_close(server_control_file, control)
-            self._execute_code(server_control_file, namespace)
+            if not only_collect_crashinfo:
+                # determine the dir to write the control files to
+                cfd_specified = (control_file_dir
+                                 and control_file_dir is not self.USE_TEMP_DIR)
+                if cfd_specified:
+                    temp_control_file_dir = None
+                else:
+                    temp_control_file_dir = tempfile.mkdtemp(
+                        suffix='temp_control_file_dir')
+                    control_file_dir = temp_control_file_dir
+                server_control_file = os.path.join(control_file_dir,
+                                                   SERVER_CONTROL_FILENAME)
+                client_control_file = os.path.join(control_file_dir,
+                                                   CLIENT_CONTROL_FILENAME)
+                if self.client:
+                    namespace['control'] = control
+                    utils.open_write_close(client_control_file, control)
+                    shutil.copy(CLIENT_WRAPPER_CONTROL_FILE,
+                                server_control_file)
+                else:
+                    utils.open_write_close(server_control_file, control)
+                self._execute_code(server_control_file, namespace)

-            # disable crashinfo collection if we get this far without error
-            collect_crashinfo = False
+                # no error occured, so we don't need to collect crashinfo
+                collect_crashinfo = False
         finally:
             if temp_control_file_dir:
                 # Clean up temp directory used for copies of the control files
diff --git a/server/server_job_unittest.py b/server/server_job_unittest.py
index 4411b7cf..a553d6a3 100644
--- a/server/server_job_unittest.py
+++ b/server/server_job_unittest.py
@@ -59,6 +59,7 @@ class CopyLogsTest(unittest.TestCase):
         os.path.exists.expect_call(
                 mock.is_string_comparator()).and_return(False)
         os.mkdir.expect_call(mock.is_string_comparator())
+        os.path.exists.expect_call(self.uncollected).and_return(False)
         server_job.open.expect_call(self.uncollected,
                                     'w').and_return(file_obj)
         pickle.dump.expect_call([], file_obj)
         file_obj.close.expect_call()
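For readers unfamiliar with the uncollected-logs mechanism the remote.py hunk
relies on: server_job seeds self.uncollected_log_file with an empty pickled
list, and each entry is a (hostname, remote_path, local_path) tuple that
get_crashinfo() replays for its own host. A minimal standalone sketch of that
file format (Python 2 to match the codebase; the file name, host name and
paths below are illustrative only, not Autotest's real values):

import pickle

uncollected_log_file = "uncollected_logs"   # hypothetical file name

# what server_job writes when the results directory is first set up
log_file = open(uncollected_log_file, "w")
pickle.dump([], log_file)
log_file.close()

# each entry records where a client's results live on the remote host and
# where they should be copied locally
logs = pickle.load(open(uncollected_log_file))
logs.append(("somehost", "/usr/local/autotest/results/default",
             "results/somehost"))
log_file = open(uncollected_log_file, "w")
pickle.dump(logs, log_file)
log_file.close()

# RemoteHost.get_crashinfo() replays only the entries for its own hostname
for hostname, remote_path, local_path in pickle.load(open(uncollected_log_file)):
    if hostname == "somehost":
        print "would fetch %s:%s into %s" % (hostname, remote_path, local_path)

Because server_job now only seeds this file when it does not already exist, a
--collect-crashinfo run over an old results directory sees whatever entries the
crashed job left behind, which is how the stranded client results get pulled
back.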