[Branch,~linaro-validation/lava-scheduler/trunk] Rev 257: Neil Williams 2013-09-02 Ensure there is an actual device before trying to check it

Message ID 20130903110541.17878.98750.launchpad@ackee.canonical.com
State Accepted
Headers show

Commit Message

Neil Williams Sept. 3, 2013, 11:05 a.m.
Merge authors:
  Neil Williams (codehelp)
Related merge proposals:
  https://code.launchpad.net/~codehelp/lava-scheduler/reserved-boards/+merge/183496
  proposed by: Neil Williams (codehelp)
  review: Approve - Antonio Terceiro (terceiro)
------------------------------------------------------------
revno: 257 [merge]
committer: Neil Williams <neil.williams@linaro.org>
branch nick: lava-scheduler
timestamp: Tue 2013-09-03 12:03:25 +0100
message:
  Neil Williams 2013-09-02 Ensure there is an actual device before trying 
   to check it.
  Neil Williams 2013-09-02 Ensure the device transitions to Running
   once the job (single node or multi node) starts to run.
  Neil Williams 2013-09-02 Add a real user to the transition.
  Neil Williams 2013-09-02 [merge] Add Senthil's original change to make it
   easier to deploy.
  Neil Williams 2013-09-02 Add a new RESERVED device status and stop _fix_device
   from overloading RUNNING for the period after submission
   but before a MultiNode job is running.
modified:
  lava_scheduler_app/admin.py
  lava_scheduler_app/api.py
  lava_scheduler_app/models.py
  lava_scheduler_app/views.py
  lava_scheduler_daemon/dbjobsource.py
  lava_scheduler_daemon/service.py


--
lp:lava-scheduler
https://code.launchpad.net/~linaro-validation/lava-scheduler/trunk

You are subscribed to branch lp:lava-scheduler.
To unsubscribe from this branch go to https://code.launchpad.net/~linaro-validation/lava-scheduler/trunk/+edit-subscription

Patch

=== modified file 'lava_scheduler_app/admin.py'
--- lava_scheduler_app/admin.py	2013-08-28 15:13:07 +0000
+++ lava_scheduler_app/admin.py	2013-09-02 15:14:15 +0000
@@ -8,7 +8,7 @@ 
 
 
 def offline_action(modeladmin, request, queryset):
-    for device in queryset.filter(status__in=[Device.IDLE, Device.RUNNING]):
+    for device in queryset.filter(status__in=[Device.IDLE, Device.RUNNING, Device.RESERVED]):
         if device.can_admin(request.user):
             device.put_into_maintenance_mode(request.user, "admin action")
 offline_action.short_description = "take offline"

=== modified file 'lava_scheduler_app/api.py'
--- lava_scheduler_app/api.py	2013-08-28 15:13:07 +0000
+++ lava_scheduler_app/api.py	2013-09-02 15:14:15 +0000
@@ -2,7 +2,6 @@ 
 from simplejson import JSONDecodeError
 from django.db.models import Count
 from linaro_django_xmlrpc.models import ExposedAPI
-from lava_scheduler_app import utils
 from lava_scheduler_app.models import (
     Device,
     DeviceType,
@@ -165,8 +164,8 @@ 
             .annotate(idle=SumIf('device', condition='status=%s' % Device.IDLE),
                       offline=SumIf('device', condition='status in (%s,%s)'
                                                         % (Device.OFFLINE, Device.OFFLINING)),
-                      busy=SumIf('device', condition='status=%s'
-                                                     % Device.RUNNING), ).order_by('name')
+                      busy=SumIf('device', condition='status in (%s,%s)'
+                                                     % (Device.RUNNING, Device.RESERVED)), ).order_by('name')
 
         for dev_type in device_types:
             device_type = {}

=== modified file 'lava_scheduler_app/models.py'
--- lava_scheduler_app/models.py	2013-08-28 15:13:07 +0000
+++ lava_scheduler_app/models.py	2013-09-02 15:42:27 +0000
@@ -51,19 +51,20 @@ 
 
 
 def check_device_availability(requested_devices):
-    """Checks whether the number of devices requested is available.
+    """Checks whether the number of devices requested is available for a multinode job.
     
     See utils.requested_device_count() for details of REQUESTED_DEVICES
     dictionary format.
 
-    Returns True if the requested number of devices are available, else
-    raises DevicesUnavailableException.
+    Returns True for singlenode or if the requested number of devices are available
+    for the multinode job, else raises DevicesUnavailableException.
     """
     device_types = DeviceType.objects.values_list('name').filter(
-        models.Q(device__status=Device.IDLE) | \
-            models.Q(device__status=Device.RUNNING)
+        models.Q(device__status=Device.IDLE) |
+        models.Q(device__status=Device.RUNNING) |
+        models.Q(device__status=Device.RESERVED)
         ).annotate(
-        num_count=models.Count('name')
+            num_count=models.Count('name')
         ).order_by('name')
 
     if requested_devices:
@@ -115,6 +116,7 @@ 
     RUNNING = 2
     OFFLINING = 3
     RETIRED = 4
+    RESERVED = 5
 
     STATUS_CHOICES = (
         (OFFLINE, 'Offline'),
@@ -122,6 +124,7 @@ 
         (RUNNING, 'Running'),
         (OFFLINING, 'Going offline'),
         (RETIRED, 'Retired'),
+        (RESERVED, 'Reserved')
     )
 
     # A device health shows a device is ready to test or not
@@ -201,7 +204,7 @@ 
         return user.has_perm('lava_scheduler_app.change_device')
 
     def put_into_maintenance_mode(self, user, reason):
-        if self.status in [self.RUNNING, self.OFFLINING]:
+        if self.status in [self.RUNNING, self.RESERVED, self.OFFLINING]:
             new_status = self.OFFLINING
         else:
             new_status = self.OFFLINE
@@ -236,6 +239,16 @@ 
         self.health_status = Device.HEALTH_LOOPING
         self.save()
 
+    def cancel_reserved_status(self, user, reason):
+        if self.status != Device.RESERVED:
+            return
+        new_status = self.IDLE
+        DeviceStateTransition.objects.create(
+            created_by=user, device=self, old_state=self.status,
+            new_state=new_status, message=reason, job=None).save()
+        self.status = new_status
+        self.save()
+
 
 class JobFailureTag(models.Model):
     """
@@ -324,7 +337,7 @@ 
 
     tags = models.ManyToManyField(Tag, blank=True)
 
-    # This is set once the job starts.
+    # This is set once the job starts or is reserved.
     actual_device = models.ForeignKey(
         Device, null=True, default=None, related_name='+', blank=True)
 
@@ -598,6 +611,10 @@ 
         return self._can_admin(user) and self.status in states
 
     def cancel(self):
+        # if SUBMITTED with actual_device - clear the actual_device back to idle.
+        if self.status == TestJob.SUBMITTED and self.actual_device is not None:
+            device = Device.objects.get(hostname=self.actual_device)
+            device.cancel_reserved_status(self.submitter, "multinode-cancel")
         if self.status == TestJob.RUNNING:
             self.status = TestJob.CANCELING
         else:

=== modified file 'lava_scheduler_app/views.py'
--- lava_scheduler_app/views.py	2013-08-28 15:13:07 +0000
+++ lava_scheduler_app/views.py	2013-09-02 15:14:15 +0000
@@ -371,7 +371,8 @@ 
             .annotate(idle=SumIf('device', condition='status=%s' % Device.IDLE),
                       offline=SumIf('device', condition='status in (%s,%s)' %
                                                         (Device.OFFLINE, Device.OFFLINING)),
-                      busy=SumIf('device', condition='status=%s' % Device.RUNNING),).order_by('name')
+                      busy=SumIf('device', condition='status in (%s,%s)' %
+                                                     (Device.RUNNING, Device.RESERVED)),).order_by('name')
 
     def render_status(self, record):
         return "%s idle, %s offline, %s busy" % (record.idle,
@@ -535,7 +536,7 @@ 
                 'health_jobs', reverse(health_jobs_json, kwargs=dict(pk=pk)),
                 params=(device,)),
             'show_maintenance': device.can_admin(request.user) and
-            device.status in [Device.IDLE, Device.RUNNING],
+            device.status in [Device.IDLE, Device.RUNNING, Device.RESERVED],
             'show_online': device.can_admin(request.user) and
             device.status in [Device.OFFLINE, Device.OFFLINING],
             'bread_crumb_trail': BreadCrumbTrail.leading_to(health_job_list, pk=pk),
@@ -993,7 +994,7 @@ 
                 'jobs', reverse(recent_jobs_json, kwargs=dict(pk=device.pk)),
                 params=(device,)),
             'show_maintenance': device.can_admin(request.user) and
-            device.status in [Device.IDLE, Device.RUNNING],
+            device.status in [Device.IDLE, Device.RUNNING, Device.RESERVED],
             'show_online': device.can_admin(request.user) and
             device.status in [Device.OFFLINE, Device.OFFLINING],
             'bread_crumb_trail': BreadCrumbTrail.leading_to(device_detail, pk=pk),

=== modified file 'lava_scheduler_daemon/dbjobsource.py'
--- lava_scheduler_daemon/dbjobsource.py	2013-08-31 01:38:21 +0000
+++ lava_scheduler_daemon/dbjobsource.py	2013-09-02 18:14:25 +0000
@@ -129,14 +129,18 @@ 
     def _fix_device(self, device, job):
         """Associate an available/idle DEVICE to the given JOB.
 
+        If the MultiNode job is waiting as Submitted, the device
+        could be running a different job.
         Returns the job with actual_device set to DEVICE.
 
         If we are unable to grab the DEVICE then we return None.
         """
+        if device.status == Device.RUNNING:
+            return None
         DeviceStateTransition.objects.create(
             created_by=None, device=device, old_state=device.status,
-            new_state=Device.RUNNING, message=None, job=job).save()
-        device.status = Device.RUNNING
+            new_state=Device.RESERVED, message=None, job=job).save()
+        device.status = Device.RESERVED
         device.current_job = job
         try:
             # The unique constraint on current_job may cause this to
@@ -190,10 +194,10 @@ 
                 for d in devices:
                     self.logger.debug("Checking %s" % d.hostname)
                     if d.hostname in configured_boards:
-                       if job:
-                           job = self._fix_device(d, job)
-                       if job:
-                           job_list.add(job)
+                        if job:
+                            job = self._fix_device(d, job)
+                        if job:
+                            job_list.add(job)
 
         # Remove scheduling multinode jobs until all the jobs in the
         # target_group are assigned devices.
@@ -288,6 +292,14 @@ 
 
     def getJobDetails_impl(self, job):
         job.status = TestJob.RUNNING
+        # need to set the device RUNNING if device was RESERVED
+        if job.actual_device.status == Device.RESERVED:
+            DeviceStateTransition.objects.create(
+                created_by=None, device=job.actual_device, old_state=job.actual_device.status,
+                new_state=Device.RUNNING, message=None, job=job).save()
+            job.actual_device.status = Device.RUNNING
+            job.actual_device.current_job = job
+            job.actual_device.save()
         job.start_time = datetime.datetime.utcnow()
         shutil.rmtree(job.output_dir, ignore_errors=True)
         job.log_file.save('job-%s.log' % job.id, ContentFile(''), save=False)
@@ -316,6 +328,8 @@ 
             device.status = Device.IDLE
         elif device.status == Device.OFFLINING:
             device.status = Device.OFFLINE
+        elif device.status == Device.RESERVED:
+            device.status = Device.IDLE
         else:
             self.logger.error(
                 "Unexpected device state in jobCompleted: %s" % device.status)

=== modified file 'lava_scheduler_daemon/service.py'
--- lava_scheduler_daemon/service.py	2013-08-30 18:07:18 +0000
+++ lava_scheduler_daemon/service.py	2013-09-02 18:14:53 +0000
@@ -47,7 +47,7 @@ 
             x.hostname for x in dispatcher_config.get_devices()]
 
         for job in job_list:
-            if job.actual_device.hostname in configured_boards:
+            if job.actual_device and job.actual_device.hostname in configured_boards:
                 new_job = JobRunner(self.source, job, self.dispatcher,
                                     self.reactor, self.daemon_options)
                 self.logger.info("Starting Job: %d " % job.id)