.gitlab-ci/lava/lava_job_submitter.py

7ec681f3Smrg#!/usr/bin/env python3
7ec681f3Smrg#
7ec681f3Smrg# Copyright (C) 2020, 2021 Collabora Limited
7ec681f3Smrg# Author: Gustavo Padovan <gustavo.padovan@collabora.com>
7ec681f3Smrg#
7ec681f3Smrg# Permission is hereby granted, free of charge, to any person obtaining a
7ec681f3Smrg# copy of this software and associated documentation files (the "Software"),
7ec681f3Smrg# to deal in the Software without restriction, including without limitation
7ec681f3Smrg# the rights to use, copy, modify, merge, publish, distribute, sublicense,
7ec681f3Smrg# and/or sell copies of the Software, and to permit persons to whom the
7ec681f3Smrg# Software is furnished to do so, subject to the following conditions:
7ec681f3Smrg#
7ec681f3Smrg# The above copyright notice and this permission notice (including the next
7ec681f3Smrg# paragraph) shall be included in all copies or substantial portions of the
7ec681f3Smrg# Software.
7ec681f3Smrg#
7ec681f3Smrg# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
7ec681f3Smrg# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
7ec681f3Smrg# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
7ec681f3Smrg# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
7ec681f3Smrg# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
7ec681f3Smrg# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
7ec681f3Smrg# SOFTWARE.
7ec681f3Smrg
7ec681f3Smrg"""Send a job to LAVA, track it and collect log back"""
7ec681f3Smrg
import argparse
import os
import sys
import time
import traceback
import urllib.parse
import xmlrpc
import xmlrpc.client
from datetime import datetime, timedelta

import lavacli
import yaml
from lavacli.utils import loader
7ec681f3Smrg
# Timeout in minutes to decide if the device from the dispatched LAVA job has
# hung or not due to the lack of new log output.
DEVICE_HANGING_TIMEOUT_MIN = 5

# How many seconds the script should wait before trying a new polling
# iteration to check if the dispatched LAVA job is running or still waiting in
# the job queue.
WAIT_FOR_DEVICE_POLLING_TIME_SEC = 10

# How many seconds to wait between log output LAVA RPC calls.
LOG_POLLING_TIME_SEC = 5

# How many retries should be made when a timeout happens.
NUMBER_OF_RETRIES_TIMEOUT_DETECTION = 2
7ec681f3Smrg
7ec681f3Smrg
def print_log(msg):
    """Print *msg* to stdout prefixed with the current local timestamp."""
    print(f"{datetime.now()}: {msg}")
7ec681f3Smrg
def fatal_err(msg):
    """Log *msg* through print_log() and terminate with exit status 1.

    Raises:
        SystemExit: always, via sys.exit(1).
    """
    print_log(msg)
    sys.exit(1)
7ec681f3Smrg
def generate_lava_yaml(args):
    """Build the LAVA job definition for this CI job and return it as YAML.

    The definition consists of three actions -- deploy (kernel, optional dtb
    and NFS rootfs over TFTP), boot (over NFS) and a single inline test whose
    run steps are the first-stage init script followed by the commands that
    fetch the build artifacts and hand over to /init-stage2.sh.

    Args:
        args: parsed command-line namespace; the attributes read here are
            pipeline_info, device_type, visibility_group, job_artifacts_base,
            job_timeout, lava_tags, base_system_url_prefix, kernel_image_name,
            kernel_image_type, dtb, boot_method, first_stage_init,
            ci_project_dir, mesa_build_url, job_rootfs_overlay_url and jwt.

    Returns:
        The job definition serialized with yaml.dump().
    """
    # General metadata and permissions, plus also inexplicably kernel arguments
    values = {
        'job_name': 'mesa: {}'.format(args.pipeline_info),
        'device_type': args.device_type,
        'visibility': {'group': [args.visibility_group]},
        'priority': 75,
        'context': {
            'extra_nfsroot_args': ' init=/init rootwait minio_results={}'.format(args.job_artifacts_base)
        },
        'timeouts': {
            'job': {
                'minutes': args.job_timeout
            }
        },
    }

    if args.lava_tags:
        values['tags'] = args.lava_tags.split(',')

    # URLs to our kernel rootfs to boot from, both generated by the base
    # container build
    deploy = {
        'timeout': {'minutes': 10},
        'to': 'tftp',
        'os': 'oe',
        'kernel': {
            'url': '{}/{}'.format(args.base_system_url_prefix, args.kernel_image_name),
        },
        'nfsrootfs': {
            'url': '{}/lava-rootfs.tgz'.format(args.base_system_url_prefix),
            'compression': 'gz',
        },
    }
    if args.kernel_image_type:
        deploy['kernel']['type'] = args.kernel_image_type
    if args.dtb:
        deploy['dtb'] = {
            'url': '{}/{}.dtb'.format(args.base_system_url_prefix, args.dtb)
        }

    # always boot over NFS
    boot = {
        'timeout': {'minutes': 25},
        'method': args.boot_method,
        'commands': 'nfs',
        'prompts': ['lava-shell:'],
    }

    # skeleton test definition: only declaring each job as a single 'test'
    # since LAVA's test parsing is not useful to us
    test = {
        'timeout': {'minutes': args.job_timeout},
        'failure_retry': 1,
        'definitions': [{
            'name': 'mesa',
            'from': 'inline',
            'path': 'inline/mesa.yaml',
            'repository': {
                'metadata': {
                    'name': 'mesa',
                    'description': 'Mesa test plan',
                    'os': ['oe'],
                    'scope': ['functional'],
                    'format': 'Lava-Test Test Definition 1.0',
                },
                'parse': {
                    'pattern': r'hwci: (?P<test_case_id>\S*):\s+(?P<result>(pass|fail))'
                },
                'run': {
                },
            },
        }],
    }

    # job execution script:
    #   - inline .gitlab-ci/common/init-stage1.sh
    #   - fetch and unpack per-pipeline build artifacts from build job
    #   - fetch and unpack per-job environment from lava-submit.sh
    #   - exec .gitlab-ci/common/init-stage2.sh
    init_lines = []
    with open(args.first_stage_init, 'r', encoding='utf-8') as init_sh:
        # Drop blank lines and lines starting with '#'; keep the rest without
        # trailing whitespace.
        init_lines += [x.rstrip() for x in init_sh if not x.startswith('#') and x.rstrip()]
    # NOTE: the values below are interpolated into shell command lines
    # verbatim (no quoting/escaping is applied).
    init_lines += [
        'mkdir -p {}'.format(args.ci_project_dir),
        'wget -S --progress=dot:giga -O- {} | tar -xz -C {}'.format(args.mesa_build_url, args.ci_project_dir),
        'wget -S --progress=dot:giga -O- {} | tar -xz -C /'.format(args.job_rootfs_overlay_url),
        # Shell tracing is switched off around the export so the token value
        # is not echoed by `set -x`.
        'set +x',
        'export CI_JOB_JWT="{}"'.format(args.jwt),
        'set -x',
        'exec /init-stage2.sh',
    ]
    test['definitions'][0]['repository']['run']['steps'] = init_lines

    values['actions'] = [
        {'deploy': deploy},
        {'boot': boot},
        {'test': test},
    ]

    # A huge width keeps every (long) shell command on a single YAML line.
    return yaml.dump(values, width=10000000)
7ec681f3Smrg
7ec681f3Smrg
def setup_lava_proxy():
    """Create an XML-RPC proxy for the LAVA server of the "default" identity.

    The uri, username and token are read from the lavacli configuration; the
    credentials are embedded in the URL handed to ServerProxy, while the log
    message only shows the configured (credential-free) uri.

    Returns:
        An xmlrpc.client.ServerProxy using lavacli's RequestsTransport.
    """
    config = lavacli.load_config("default")
    uri, usr, tok = (config.get(key) for key in ("uri", "username", "token"))
    uri_obj = urllib.parse.urlparse(uri)
    uri_str = "{}://{}:{}@{}{}".format(uri_obj.scheme, usr, tok, uri_obj.netloc, uri_obj.path)
    transport = lavacli.RequestsTransport(
        uri_obj.scheme,
        config.get("proxy"),
        config.get("timeout", 120.0),
        config.get("verify_ssl_cert", True),
    )
    proxy = xmlrpc.client.ServerProxy(
        uri_str, allow_none=True, transport=transport)

    print_log("Proxy for {} created.".format(uri))

    return proxy
7ec681f3Smrg
7ec681f3Smrg
7ec681f3Smrgdef _call_proxy(fn, *args):
7ec681f3Smrg    retries = 60
7ec681f3Smrg    for n in range(1, retries + 1):
7ec681f3Smrg        try:
7ec681f3Smrg            return fn(*args)
7ec681f3Smrg        except xmlrpc.client.ProtocolError as err:
7ec681f3Smrg            if n == retries:
7ec681f3Smrg                traceback.print_exc()
7ec681f3Smrg                fatal_err("A protocol error occurred (Err {} {})".format(err.errcode, err.errmsg))
7ec681f3Smrg            else:
7ec681f3Smrg                time.sleep(15)
7ec681f3Smrg                pass
7ec681f3Smrg        except xmlrpc.client.Fault as err:
7ec681f3Smrg            traceback.print_exc()
7ec681f3Smrg            fatal_err("FATAL: Fault: {} (code: {})".format(err.faultString, err.faultCode))
7ec681f3Smrg
7ec681f3Smrg
def get_job_results(proxy, job_id, test_suite, test_case):
    """Inspect the results of a finished LAVA job.

    Args:
        proxy: LAVA XML-RPC server proxy.
        job_id: id of the job to inspect.
        test_suite: name of the test suite holding the verdict.
        test_case: name of the test case holding the verdict.

    Returns:
        False when the job failed because of an infrastructure error or a
        failed "validate" case, i.e. when the caller should retry; True when
        the requested test case passed.

    The script is terminated through fatal_err() when the test case has no
    result or its result is anything other than 'pass'.
    """
    # Look for infrastructure errors and retry if we see them.
    results_yaml = _call_proxy(proxy.results.get_testjob_results_yaml, job_id)
    results = yaml.load(results_yaml, Loader=loader(False))
    # An empty YAML document loads as None; treat it as "no entries".
    for res in results or []:
        metadata = res['metadata']
        if metadata.get('result') != 'fail':
            continue
        if metadata.get('error_type') == "Infrastructure":
            print_log("LAVA job {} failed with Infrastructure Error. Retry.".format(job_id))
            return False
        if metadata.get('case') == "validate":
            print_log("LAVA job {} failed validation (possible download error). Retry.".format(job_id))
            return False

    results_yaml = _call_proxy(proxy.results.get_testcase_results_yaml, job_id, test_suite, test_case)
    results = yaml.load(results_yaml, Loader=loader(False))
    if not results:
        fatal_err("LAVA: no result for test_suite '{}', test_case '{}'".format(test_suite, test_case))

    print_log("LAVA: result for test_suite '{}', test_case '{}': {}".format(test_suite, test_case, results[0]['result']))
    if results[0]['result'] != 'pass':
        fatal_err("FAIL")

    return True
7ec681f3Smrg
def wait_until_job_is_started(proxy, job_id):
    """Block until LAVA job *job_id* has left the scheduling queue.

    The job state is polled every WAIT_FOR_DEVICE_POLLING_TIME_SEC seconds
    for as long as it is "Submitted", "Scheduling" or "Scheduled"; any other
    state is treated as "started".

    Args:
        proxy: LAVA XML-RPC server proxy.
        job_id: id of the submitted job.
    """
    print_log(f"Waiting for job {job_id} to start.")
    waiting_states = ("Submitted", "Scheduling", "Scheduled")
    while True:
        job_state = _call_proxy(proxy.scheduler.job_state, job_id)
        if job_state["job_state"] not in waiting_states:
            break
        # Only sleep while the job is still queued, so that log following
        # begins as soon as the job is seen running.
        time.sleep(WAIT_FOR_DEVICE_POLLING_TIME_SEC)
    print_log(f"Job {job_id} started.")
7ec681f3Smrg
def follow_job_execution(proxy, job_id):
    """Stream the log of LAVA job *job_id* to stdout until it finishes.

    Args:
        proxy: LAVA XML-RPC server proxy.
        job_id: id of the running job.

    Returns:
        True once the server reports the job log as finished; False when no
        new log line showed up for more than DEVICE_HANGING_TIMEOUT_MIN
        minutes, in which case the caller is expected to retry.
    """
    line_count = 0
    last_time_logs = datetime.now()
    time_limit = timedelta(minutes=DEVICE_HANGING_TIMEOUT_MIN)
    while True:
        # line_count tells the server how many lines were already fetched.
        (finished, data) = _call_proxy(proxy.scheduler.jobs.logs, job_id, line_count)
        logs = yaml.load(str(data), Loader=loader(False))
        if logs:
            # Reset the timeout
            last_time_logs = datetime.now()
            for line in logs:
                print("{} {}".format(line["dt"], line["msg"]))

            line_count += len(logs)

        # A finished job is never a hung one: report success right away,
        # without a further hang check or polling delay.
        if finished:
            return True

        if not logs and datetime.now() - last_time_logs > time_limit:
            print_log("LAVA job {} doesn't advance (machine got hung?). Retry.".format(job_id))
            return False

        # `proxy.scheduler.jobs.logs` does not block, even when there is no
        # new log to be fetched. To avoid DoSing the LAVA dispatcher
        # machine, let's add a sleep to save them some stamina.
        time.sleep(LOG_POLLING_TIME_SEC)
7ec681f3Smrg
def show_job_data(proxy, job_id):
    """Print every field returned by scheduler.jobs.show for *job_id*.

    Each field is written to stdout as one "<field>\\t: <value>" line.
    """
    show = _call_proxy(proxy.scheduler.jobs.show, job_id)
    for field, value in show.items():
        print("{}\t: {}".format(field, value))
7ec681f3Smrg
7ec681f3Smrg
def validate_job(proxy, job_file):
    """Ask the LAVA server to validate the job definition *job_file*.

    Args:
        proxy: LAVA XML-RPC server proxy.
        job_file: job definition as a YAML string.

    Returns:
        The server's answer on success, False when the call failed.
    """
    try:
        return _call_proxy(proxy.scheduler.jobs.validate, job_file, True)
    except (Exception, SystemExit):
        # _call_proxy() reports failures through fatal_err(), which calls
        # sys.exit(); SystemExit therefore has to be caught explicitly to
        # turn a rejected definition into a False return value. Unlike a bare
        # `except:`, KeyboardInterrupt is left alone.
        return False
7ec681f3Smrg
def submit_job(proxy, job_file):
    """Submit the job definition *job_file* and return the server's answer.

    The caller uses the returned value as the id of the newly created job.
    """
    return _call_proxy(proxy.scheduler.jobs.submit, job_file)
7ec681f3Smrg
7ec681f3Smrg
def main(args):
    """Generate the LAVA job, submit it and follow it until a verdict.

    With --dump-yaml the job definition is printed with the JWT hidden; with
    --validate-only the definition is only validated against the server.
    Otherwise the job is submitted and re-submitted (up to
    NUMBER_OF_RETRIES_TIMEOUT_DETECTION extra times) whenever it hangs or
    fails for infrastructure reasons. The script exits with status 1 through
    fatal_err() if no attempt produced a passing result.

    Args:
        args: parsed command-line namespace.
    """
    proxy = setup_lava_proxy()

    yaml_file = generate_lava_yaml(args)

    if args.dump_yaml:
        # Censor a copy: assigning to an alias of `args` would overwrite the
        # real token for everything that uses `args` afterwards.
        censored_args = argparse.Namespace(**vars(args))
        censored_args.jwt = "jwt-hidden"
        print(generate_lava_yaml(censored_args))

    if args.validate_only:
        if not validate_job(proxy, yaml_file):
            fatal_err("Error in LAVA job definition")
        print("LAVA job definition validated successfully")
        return

    attempts = NUMBER_OF_RETRIES_TIMEOUT_DETECTION + 1

    # Every kind of retry consumes one attempt, so the loop is bounded.
    for _ in range(attempts):
        job_id = submit_job(proxy, yaml_file)

        print_log("LAVA job id: {}".format(job_id))

        wait_until_job_is_started(proxy, job_id)

        if not follow_job_execution(proxy, job_id):
            print_log(f"Job {job_id} has timed out. Cancelling it.")
            # Cancel the job as it is considered unreachable by Mesa CI.
            _call_proxy(proxy.scheduler.jobs.cancel, job_id)
            continue

        show_job_data(proxy, job_id)

        if get_job_results(proxy, job_id, "0_mesa", "mesa"):
            return

    # Falling out of the loop means no attempt succeeded; do not let the CI
    # job finish with a successful exit status.
    fatal_err("LAVA job did not complete successfully after {} attempts".format(attempts))
7ec681f3Smrg
7ec681f3Smrg
if __name__ == '__main__':
    # given that we proxy from DUT -> LAVA dispatcher -> LAVA primary -> us ->
    # GitLab runner -> GitLab primary -> user, safe to say we don't need any
    # more buffering
    sys.stdout.reconfigure(line_buffering=True)
    sys.stderr.reconfigure(line_buffering=True)
    parser = argparse.ArgumentParser("LAVA job submitter")

    parser.add_argument("--pipeline-info")
    parser.add_argument("--base-system-url-prefix")
    parser.add_argument("--mesa-build-url")
    parser.add_argument("--job-rootfs-overlay-url")
    parser.add_argument("--job-artifacts-base")
    parser.add_argument("--job-timeout", type=int)
    parser.add_argument("--first-stage-init")
    parser.add_argument("--ci-project-dir")
    parser.add_argument("--device-type")
    # nargs='?' lets these options be given without a value; the default ""
    # applies when the option is absent from the command line.
    parser.add_argument("--dtb", nargs='?', default="")
    parser.add_argument("--kernel-image-name")
    parser.add_argument("--kernel-image-type", nargs='?', default="")
    parser.add_argument("--boot-method")
    parser.add_argument("--lava-tags", nargs='?', default="")
    parser.add_argument("--jwt")
    parser.add_argument("--validate-only", action='store_true')
    parser.add_argument("--dump-yaml", action='store_true')
    parser.add_argument("--visibility-group")

    parser.set_defaults(func=main)
    args = parser.parse_args()
    args.func(args)