#!/usr/bin/env python3
#
# Copyright (C) 2020, 2021 Collabora Limited
# Author: Gustavo Padovan <gustavo.padovan@collabora.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

"""Send a job to LAVA, track it and collect its log output."""

import argparse
import os
import sys
import time
import traceback
import urllib.parse
import xmlrpc
import xmlrpc.client
from datetime import datetime, timedelta

import lavacli
import yaml
from lavacli.utils import loader

# Timeout in minutes used to decide whether the device running the dispatched
# LAVA job has hung, judged by the lack of new log output.
DEVICE_HANGING_TIMEOUT_MIN = 5

# How many seconds the script should wait before trying a new polling
# iteration to check if the dispatched LAVA job is running or still waiting in
# the job queue.
WAIT_FOR_DEVICE_POLLING_TIME_SEC = 10

# How many seconds to wait between log output LAVA RPC calls.
LOG_POLLING_TIME_SEC = 5

# How many retries should be made when a timeout happens.
NUMBER_OF_RETRIES_TIMEOUT_DETECTION = 2


def print_log(msg):
    """Print msg on stdout, prefixed with the current local date and time."""
    print(f"{datetime.now()}: {msg}")

def fatal_err(msg):
    """Log msg through print_log() and terminate the script with status 1.

    Termination happens via sys.exit(), i.e. by raising SystemExit, so this
    function never returns normally.
    """
    print_log(msg)
    sys.exit(1)

def generate_lava_yaml(args):
    """Render the LAVA job definition for this CI job as a YAML string.

    args is the parsed command line (an argparse namespace). The job consists
    of job-level metadata followed by three actions -- deploy, boot and test --
    each assembled by its own private helper below.

    Returns the YAML text produced by yaml.dump().
    Raises OSError if the file named by args.first_stage_init cannot be read.
    """
    # General metadata and permissions, plus also inexplicably kernel arguments
    values = {
        'job_name': 'mesa: {}'.format(args.pipeline_info),
        'device_type': args.device_type,
        'visibility': {'group': [args.visibility_group]},
        'priority': 75,
        'context': {
            'extra_nfsroot_args': ' init=/init rootwait minio_results={}'.format(args.job_artifacts_base),
        },
        'timeouts': {
            'job': {
                'minutes': args.job_timeout,
            },
        },
    }

    if args.lava_tags:
        values['tags'] = args.lava_tags.split(',')

    values['actions'] = [
        {'deploy': _lava_deploy_action(args)},
        {'boot': _lava_boot_action(args)},
        {'test': _lava_test_action(args)},
    ]

    # NOTE(review): the very large width presumably keeps the long shell steps
    # from being folded across several lines by the YAML emitter -- confirm.
    return yaml.dump(values, width=10000000)


def _lava_deploy_action(args):
    """Build the 'deploy' action: kernel, NFS rootfs and optional DTB URLs."""
    # URLs to our kernel and rootfs to boot from, both generated by the base
    # container build
    deploy = {
        'timeout': {'minutes': 10},
        'to': 'tftp',
        'os': 'oe',
        'kernel': {
            'url': '{}/{}'.format(args.base_system_url_prefix, args.kernel_image_name),
        },
        'nfsrootfs': {
            'url': '{}/lava-rootfs.tgz'.format(args.base_system_url_prefix),
            'compression': 'gz',
        },
    }
    # Both settings are optional on the command line; only emit them when set.
    if args.kernel_image_type:
        deploy['kernel']['type'] = args.kernel_image_type
    if args.dtb:
        deploy['dtb'] = {
            'url': '{}/{}.dtb'.format(args.base_system_url_prefix, args.dtb),
        }
    return deploy


def _lava_boot_action(args):
    """Build the 'boot' action; the device always boots over NFS."""
    return {
        'timeout': {'minutes': 25},
        'method': args.boot_method,
        'commands': 'nfs',
        'prompts': ['lava-shell:'],
    }


def _lava_test_action(args):
    """Build the 'test' action: one inline 'mesa' definition running the init steps."""
    # skeleton test definition: only declaring each job as a single 'test'
    # since LAVA's test parsing is not useful to us
    return {
        'timeout': {'minutes': args.job_timeout},
        'failure_retry': 1,
        'definitions': [{
            'name': 'mesa',
            'from': 'inline',
            'path': 'inline/mesa.yaml',
            'repository': {
                'metadata': {
                    'name': 'mesa',
                    'description': 'Mesa test plan',
                    'os': ['oe'],
                    'scope': ['functional'],
                    'format': 'Lava-Test Test Definition 1.0',
                },
                'parse': {
                    'pattern': r'hwci: (?P<test_case_id>\S*):\s+(?P<result>(pass|fail))',
                },
                'run': {
                    'steps': _lava_init_steps(args),
                },
            },
        }],
    }


def _lava_init_steps(args):
    """Return the shell lines the device runs as the test's 'run' steps."""
    # job execution script:
    #   - inline .gitlab-ci/common/init-stage1.sh
    #   - fetch and unpack per-pipeline build artifacts from build job
    #   - fetch and unpack per-job environment from lava-submit.sh
    #   - exec .gitlab-ci/common/init-stage2.sh
    with open(args.first_stage_init, 'r') as init_sh:
        # Inline the first-stage script, dropping blank lines and lines that
        # start with '#'.
        steps = [
            line.rstrip()
            for line in init_sh
            if not line.startswith('#') and line.rstrip()
        ]
    steps += [
        'mkdir -p {}'.format(args.ci_project_dir),
        'wget -S --progress=dot:giga -O- {} | tar -xz -C {}'.format(args.mesa_build_url, args.ci_project_dir),
        'wget -S --progress=dot:giga -O- {} | tar -xz -C /'.format(args.job_rootfs_overlay_url),
        # Shell tracing is switched off around the export so the token value
        # is not echoed by `set -x`.
        'set +x',
        'export CI_JOB_JWT="{}"'.format(args.jwt),
        'set -x',
        'exec /init-stage2.sh',
    ]
    return steps


def setup_lava_proxy():
    """Create the XML-RPC proxy used for every LAVA call in this script.

    The server URI, username and token are read from the lavacli configuration
    named "default"; the credentials are embedded in the URL handed to
    xmlrpc.client.ServerProxy. The transport is lavacli's RequestsTransport,
    configured with the "proxy", "timeout" (default 120.0) and
    "verify_ssl_cert" (default True) configuration entries.

    Returns the xmlrpc.client.ServerProxy instance.
    """
    config = lavacli.load_config("default")
    uri, usr, tok = (config.get(key) for key in ("uri", "username", "token"))
    uri_obj = urllib.parse.urlparse(uri)
    uri_str = "{}://{}:{}@{}{}".format(uri_obj.scheme, usr, tok, uri_obj.netloc, uri_obj.path)
    transport = lavacli.RequestsTransport(
        uri_obj.scheme,
        config.get("proxy"),
        config.get("timeout", 120.0),
        config.get("verify_ssl_cert", True),
    )
    proxy = xmlrpc.client.ServerProxy(
        uri_str, allow_none=True, transport=transport)

    # Log the bare URI only; uri_str carries the username and token.
    print_log("Proxy for {} created.".format(uri))

    return proxy


1847ec681f3Smrgdef _call_proxy(fn, *args):
1857ec681f3Smrg    retries = 60
1867ec681f3Smrg    for n in range(1, retries + 1):
1877ec681f3Smrg        try:
1887ec681f3Smrg            return fn(*args)
1897ec681f3Smrg        except xmlrpc.client.ProtocolError as err:
1907ec681f3Smrg            if n == retries:
1917ec681f3Smrg                traceback.print_exc()
1927ec681f3Smrg                fatal_err("A protocol error occurred (Err {} {})".format(err.errcode, err.errmsg))
1937ec681f3Smrg            else:
1947ec681f3Smrg                time.sleep(15)
1957ec681f3Smrg                pass
1967ec681f3Smrg        except xmlrpc.client.Fault as err:
1977ec681f3Smrg            traceback.print_exc()
1987ec681f3Smrg            fatal_err("FATAL: Fault: {} (code: {})".format(err.faultString, err.faultCode))
1997ec681f3Smrg
2007ec681f3Smrg
def get_job_results(proxy, job_id, test_suite, test_case):
    """Decide whether a finished LAVA job passed, failed or must be retried.

    proxy: XML-RPC proxy to the LAVA server.
    job_id: identifier of the job to inspect.
    test_suite, test_case: names of the single result entry that decides the
        outcome.

    Returns True when that test case reports 'pass'. Returns False when any
    failed entry of the job is marked as an "Infrastructure" error or belongs
    to the "validate" case, which the caller treats as a request to retry.
    Calls fatal_err() when the test case has no result or did not pass.
    """
    # Look for infrastructure errors and retry if we see them.
    results_yaml = _call_proxy(proxy.results.get_testjob_results_yaml, job_id)
    results = yaml.load(results_yaml, Loader=loader(False))
    for res in results:
        metadata = res['metadata']
        # Only failed entries can demand a retry.
        if metadata.get('result') != 'fail':
            continue
        if metadata.get('error_type') == "Infrastructure":
            print_log("LAVA job {} failed with Infrastructure Error. Retry.".format(job_id))
            return False
        if metadata.get('case') == "validate":
            print_log("LAVA job {} failed validation (possible download error). Retry.".format(job_id))
            return False

    results_yaml = _call_proxy(proxy.results.get_testcase_results_yaml, job_id, test_suite, test_case)
    results = yaml.load(results_yaml, Loader=loader(False))
    if not results:
        fatal_err("LAVA: no result for test_suite '{}', test_case '{}'".format(test_suite, test_case))

    outcome = results[0]['result']
    print_log("LAVA: result for test_suite '{}', test_case '{}': {}".format(test_suite, test_case, outcome))
    if outcome != 'pass':
        fatal_err("FAIL")

    return True

def wait_until_job_is_started(proxy, job_id):
    """Block until the LAVA job is no longer waiting in the scheduler queue.

    The job state is polled through `scheduler.job_state` every
    WAIT_FOR_DEVICE_POLLING_TIME_SEC seconds for as long as it is one of
    "Submitted", "Scheduling" or "Scheduled". The function returns as soon as
    any other state is seen, without a trailing sleep.
    """
    print_log(f"Waiting for job {job_id} to start.")
    waiting_states = ("Submitted", "Scheduling", "Scheduled")
    while True:
        job_state = _call_proxy(proxy.scheduler.job_state, job_id)
        if job_state["job_state"] not in waiting_states:
            break
        # Still queued: wait before asking again.
        time.sleep(WAIT_FOR_DEVICE_POLLING_TIME_SEC)
    print_log(f"Job {job_id} started.")

def follow_job_execution(proxy, job_id):
    """Stream the job's log to stdout until LAVA reports the job finished.

    proxy: XML-RPC proxy to the LAVA server.
    job_id: identifier of the job to follow.

    Returns True once `scheduler.jobs.logs` reports the job as finished.
    Returns False when an unfinished job has produced no new log lines for
    more than DEVICE_HANGING_TIMEOUT_MIN minutes, so the caller can retry.
    """
    line_count = 0
    finished = False
    hang_limit_sec = DEVICE_HANGING_TIMEOUT_MIN * 60
    # A monotonic clock keeps the hang detection immune to wall-clock jumps.
    last_log_time = time.monotonic()
    while not finished:
        (finished, data) = _call_proxy(proxy.scheduler.jobs.logs, job_id, line_count)
        logs = yaml.load(str(data), Loader=loader(False))
        if logs:
            # Reset the timeout
            last_log_time = time.monotonic()
            for line in logs:
                print("{} {}".format(line["dt"], line["msg"]))

            # Ask only for lines past the ones already printed next time.
            line_count += len(logs)

        elif not finished and time.monotonic() - last_log_time > hang_limit_sec:
            # A job that already finished is never reported as hung.
            print_log("LAVA job {} doesn't advance (machine got hung?). Retry.".format(job_id))
            return False

        # `proxy.scheduler.jobs.logs` does not block, even when there is no
        # new log to be fetched. To avoid DoSing the LAVA dispatcher
        # machine, sleep between polls; there is nothing left to poll once
        # the job has finished.
        if not finished:
            time.sleep(LOG_POLLING_TIME_SEC)

    return True

def show_job_data(proxy, job_id):
    """Print each field of the job's `scheduler.jobs.show` record on stdout."""
    show = _call_proxy(proxy.scheduler.jobs.show, job_id)
    for field, value in show.items():
        print(f"{field}\t: {value}")


def validate_job(proxy, job_file):
    """Ask LAVA to validate a job definition without submitting it.

    proxy: XML-RPC proxy to the LAVA server.
    job_file: the job definition as YAML text.

    Returns the result of `scheduler.jobs.validate`, or False when the call
    fails.
    """
    try:
        return _call_proxy(proxy.scheduler.jobs.validate, job_file, True)
    except (Exception, SystemExit):
        # _call_proxy() reports RPC failures through fatal_err(), which raises
        # SystemExit; catching it here lets the caller print its own message.
        # KeyboardInterrupt is deliberately left to propagate.
        return False

def submit_job(proxy, job_file):
    """Submit the YAML job definition to LAVA via `scheduler.jobs.submit`.

    Returns whatever the RPC call returns; the caller uses it as the job id.
    """
    return _call_proxy(proxy.scheduler.jobs.submit, job_file)


def main(args):
    """Submit the job to LAVA, follow it and exit according to its result.

    args is the parsed command line. With --dump-yaml the job definition is
    printed with the JWT replaced by a placeholder. With --validate-only the
    definition is validated and nothing is submitted.

    Otherwise the job is submitted up to NUMBER_OF_RETRIES_TIMEOUT_DETECTION
    + 1 times. An attempt is repeated when the device appears hung or when
    get_job_results() asks for a retry. The function returns normally when an
    attempt passes and calls fatal_err() when every attempt was used up, so
    the script never exits with status 0 for a job that did not pass.
    """
    proxy = setup_lava_proxy()

    yaml_file = generate_lava_yaml(args)

    if args.dump_yaml:
        # Work on a copy so the real token in args is left untouched.
        censored_args = argparse.Namespace(**vars(args))
        censored_args.jwt = "jwt-hidden"
        print(generate_lava_yaml(censored_args))

    if args.validate_only:
        if not validate_job(proxy, yaml_file):
            fatal_err("Error in LAVA job definition")
        print("LAVA job definition validated successfully")
        return

    # One initial attempt plus the configured number of retries.
    max_attempts = NUMBER_OF_RETRIES_TIMEOUT_DETECTION + 1

    for _attempt in range(max_attempts):
        job_id = submit_job(proxy, yaml_file)

        print_log("LAVA job id: {}".format(job_id))

        wait_until_job_is_started(proxy, job_id)

        if not follow_job_execution(proxy, job_id):
            print_log(f"Job {job_id} has timed out. Cancelling it.")
            # Cancel the job as it is considered unreachable by Mesa CI.
            _call_proxy(proxy.scheduler.jobs.cancel, job_id)
            continue

        show_job_data(proxy, job_id)

        # A False result is a retry request and uses up one attempt as well.
        if get_job_results(proxy, job_id, "0_mesa", "mesa"):
            return

    fatal_err("LAVA job did not succeed after {} attempts".format(max_attempts))


if __name__ == '__main__':
    # given that we proxy from DUT -> LAVA dispatcher -> LAVA primary -> us ->
    # GitLab runner -> GitLab primary -> user, safe to say we don't need any
    # more buffering
    sys.stdout.reconfigure(line_buffering=True)
    sys.stderr.reconfigure(line_buffering=True)
    parser = argparse.ArgumentParser("LAVA job submitter")

    # Options declared with nargs='?' may be passed without a value; they
    # default to an empty string, which the job generator treats as "unset".
    parser.add_argument("--pipeline-info")
    parser.add_argument("--base-system-url-prefix")
    parser.add_argument("--mesa-build-url")
    parser.add_argument("--job-rootfs-overlay-url")
    parser.add_argument("--job-artifacts-base")
    parser.add_argument("--job-timeout", type=int)
    parser.add_argument("--first-stage-init")
    parser.add_argument("--ci-project-dir")
    parser.add_argument("--device-type")
    parser.add_argument("--dtb", nargs='?', default="")
    parser.add_argument("--kernel-image-name")
    parser.add_argument("--kernel-image-type", nargs='?', default="")
    parser.add_argument("--boot-method")
    parser.add_argument("--lava-tags", nargs='?', default="")
    parser.add_argument("--jwt")
    parser.add_argument("--validate-only", action='store_true')
    parser.add_argument("--dump-yaml", action='store_true')
    parser.add_argument("--visibility-group")

    # The entry point travels in the namespace as `func` and is invoked with
    # the parsed arguments.
    parser.set_defaults(func=main)
    args = parser.parse_args()
    args.func(args)