#!/usr/bin/env python3
#
# Copyright (C) 2020, 2021 Collabora Limited
# Author: Gustavo Padovan <gustavo.padovan@collabora.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

"""Send a job to LAVA, track it and collect log back"""

import argparse
import os
import sys
import time
import traceback
import urllib.parse
import xmlrpc
import xmlrpc.client
from datetime import datetime, timedelta

import lavacli
import yaml
from lavacli.utils import loader

# Timeout in minutes to decide if the device from the dispatched LAVA job has
# hung or not due to the lack of new log output.
DEVICE_HANGING_TIMEOUT_MIN = 5

# How many seconds the script should wait before trying a new polling
# iteration to check if the dispatched LAVA job is running or waiting in the
# job queue.
WAIT_FOR_DEVICE_POLLING_TIME_SEC = 10

# How many seconds to wait between log output LAVA RPC calls.
LOG_POLLING_TIME_SEC = 5

# How many retries should be made when a timeout happens.
NUMBER_OF_RETRIES_TIMEOUT_DETECTION = 2


def print_log(msg):
    """Print ``msg`` on stdout, prefixed with the current local timestamp."""
    print("{}: {}".format(datetime.now(), msg))


def fatal_err(msg):
    """Log ``msg`` through ``print_log`` and terminate with exit status 1.

    Raises:
        SystemExit: always.
    """
    print_log(msg)
    sys.exit(1)


def generate_lava_yaml(args):
    """Build the LAVA job definition for this run and return it as YAML text.

    ``args`` is the parsed command-line namespace; the attributes read here
    are the ones declared by the argument parser at the bottom of this file.
    The file named by ``args.first_stage_init`` is read and inlined into the
    test steps.

    Returns:
        str: the job definition serialized with ``yaml.dump``.
    """
    # General metadata and permissions, plus also inexplicably kernel arguments
    values = {
        'job_name': 'mesa: {}'.format(args.pipeline_info),
        'device_type': args.device_type,
        'visibility': {'group': [args.visibility_group]},
        'priority': 75,
        'context': {
            'extra_nfsroot_args': ' init=/init rootwait minio_results={}'.format(args.job_artifacts_base)
        },
        'timeouts': {
            'job': {
                'minutes': args.job_timeout
            }
        },
    }

    if args.lava_tags:
        values['tags'] = args.lava_tags.split(',')

    # URLs to our kernel rootfs to boot from, both generated by the base
    # container build
    deploy = {
        'timeout': {'minutes': 10},
        'to': 'tftp',
        'os': 'oe',
        'kernel': {
            'url': '{}/{}'.format(args.base_system_url_prefix, args.kernel_image_name),
        },
        'nfsrootfs': {
            'url': '{}/lava-rootfs.tgz'.format(args.base_system_url_prefix),
            'compression': 'gz',
        }
    }
    if args.kernel_image_type:
        deploy['kernel']['type'] = args.kernel_image_type
    if args.dtb:
        deploy['dtb'] = {
            'url': '{}/{}.dtb'.format(args.base_system_url_prefix, args.dtb)
        }

    # always boot over NFS
    boot = {
        'timeout': {'minutes': 25},
        'method': args.boot_method,
        'commands': 'nfs',
        'prompts': ['lava-shell:'],
    }

    # skeleton test definition: only declaring each job as a single 'test'
    # since LAVA's test parsing is not useful to us
    test = {
        'timeout': {'minutes': args.job_timeout},
        'failure_retry': 1,
        'definitions': [{
            'name': 'mesa',
            'from': 'inline',
            'path': 'inline/mesa.yaml',
            'repository': {
                'metadata': {
                    'name': 'mesa',
                    'description': 'Mesa test plan',
                    'os': ['oe'],
                    'scope': ['functional'],
                    'format': 'Lava-Test Test Definition 1.0',
                },
                'parse': {
                    'pattern': r'hwci: (?P<test_case_id>\S*):\s+(?P<result>(pass|fail))'
                },
                'run': {
                },
            },
        }],
    }

    # job execution script:
    #   - inline .gitlab-ci/common/init-stage1.sh
    #   - fetch and unpack per-pipeline build artifacts from build job
    #   - fetch and unpack per-job environment from lava-submit.sh
    #   - exec .gitlab-ci/common/init-stage2.sh
    init_lines = []
    with open(args.first_stage_init, 'r') as init_sh:
        # Drop comment lines and blank lines; keep the rest without the
        # trailing whitespace/newline.
        init_lines += [x.rstrip() for x in init_sh if not x.startswith('#') and x.rstrip()]
    init_lines += [
        'mkdir -p {}'.format(args.ci_project_dir),
        'wget -S --progress=dot:giga -O- {} | tar -xz -C {}'.format(args.mesa_build_url, args.ci_project_dir),
        'wget -S --progress=dot:giga -O- {} | tar -xz -C /'.format(args.job_rootfs_overlay_url),
        # Tracing is switched off around the export so the token value is not
        # echoed by the shell's `set -x`.
        'set +x',
        'export CI_JOB_JWT="{}"'.format(args.jwt),
        'set -x',
        'exec /init-stage2.sh',
    ]
    test['definitions'][0]['repository']['run']['steps'] = init_lines

    values['actions'] = [
        {'deploy': deploy},
        {'boot': boot},
        {'test': test},
    ]

    # A huge width keeps yaml.dump from wrapping the long shell lines.
    return yaml.dump(values, width=10000000)


def setup_lava_proxy():
    """Create an XML-RPC proxy from the "default" lavacli configuration.

    The URI, username and token are read from the configuration and the
    credentials are embedded in the URL handed to ``ServerProxy``.

    Returns:
        xmlrpc.client.ServerProxy: proxy using lavacli's requests transport.
    """
    config = lavacli.load_config("default")
    uri, usr, tok = (config.get(key) for key in ("uri", "username", "token"))
    uri_obj = urllib.parse.urlparse(uri)
    uri_str = "{}://{}:{}@{}{}".format(uri_obj.scheme, usr, tok, uri_obj.netloc, uri_obj.path)
    transport = lavacli.RequestsTransport(
        uri_obj.scheme,
        config.get("proxy"),
        config.get("timeout", 120.0),
        config.get("verify_ssl_cert", True),
    )
    proxy = xmlrpc.client.ServerProxy(
        uri_str, allow_none=True, transport=transport)

    # Log the configured URI, not uri_str, so the token stays out of the log.
    print_log("Proxy for {} created.".format(config['uri']))

    return proxy


def _call_proxy(fn, *args):
    """Call ``fn(*args)``, retrying on XML-RPC protocol errors.

    A ``ProtocolError`` is retried up to 60 times with a 15 second pause
    between attempts; when the last attempt also fails, or as soon as an
    XML-RPC ``Fault`` is raised, the traceback is printed and the process is
    terminated through ``fatal_err``.

    Returns:
        Whatever ``fn`` returns.

    Raises:
        SystemExit: via ``fatal_err`` on a fault or exhausted retries.
    """
    retries = 60
    for n in range(1, retries + 1):
        try:
            return fn(*args)
        except xmlrpc.client.ProtocolError as err:
            if n == retries:
                traceback.print_exc()
                fatal_err("A protocol error occurred (Err {} {})".format(err.errcode, err.errmsg))
            else:
                time.sleep(15)
        except xmlrpc.client.Fault as err:
            traceback.print_exc()
            fatal_err("FATAL: Fault: {} (code: {})".format(err.faultString, err.faultCode))


def get_job_results(proxy, job_id, test_suite, test_case):
    """Inspect the results of a finished job.

    Returns:
        bool: ``False`` when the job failed with an infrastructure error or
        failed the "validate" case (the caller is expected to resubmit);
        ``True`` when the requested test case passed.

    Raises:
        SystemExit: via ``fatal_err`` when the test case has no result or its
        result is anything other than ``pass``.
    """
    # Look for infrastructure errors and retry if we see them.
    results_yaml = _call_proxy(proxy.results.get_testjob_results_yaml, job_id)
    # NOTE(review): the safety of yaml.load depends on the loader class
    # returned by lavacli's `loader` - confirm it is a safe loader.
    # An empty document loads as None; treat it as "no results".
    results = yaml.load(results_yaml, Loader=loader(False)) or []
    for res in results:
        metadata = res['metadata']
        if metadata.get('result') != 'fail':
            continue
        if metadata.get('error_type') == "Infrastructure":
            print_log("LAVA job {} failed with Infrastructure Error. Retry.".format(job_id))
            return False
        if metadata.get('case') == "validate":
            print_log("LAVA job {} failed validation (possible download error). Retry.".format(job_id))
            return False

    results_yaml = _call_proxy(proxy.results.get_testcase_results_yaml, job_id, test_suite, test_case)
    results = yaml.load(results_yaml, Loader=loader(False))
    if not results:
        fatal_err("LAVA: no result for test_suite '{}', test_case '{}'".format(test_suite, test_case))

    print_log("LAVA: result for test_suite '{}', test_case '{}': {}".format(test_suite, test_case, results[0]['result']))
    if results[0]['result'] != 'pass':
        fatal_err("FAIL")

    return True


def wait_until_job_is_started(proxy, job_id):
    """Block until the job leaves the Submitted/Scheduling/Scheduled states.

    The scheduler is polled every ``WAIT_FOR_DEVICE_POLLING_TIME_SEC``
    seconds; no sleep happens once the job is seen outside the queue.
    """
    print_log(f"Waiting for job {job_id} to start.")
    waiting_states = ("Submitted", "Scheduling", "Scheduled")
    while True:
        job_state = _call_proxy(proxy.scheduler.job_state, job_id)
        if job_state["job_state"] not in waiting_states:
            break
        time.sleep(WAIT_FOR_DEVICE_POLLING_TIME_SEC)
    print_log(f"Job {job_id} started.")


def follow_job_execution(proxy, job_id):
    """Stream the job log to stdout until LAVA reports the job as finished.

    Returns:
        bool: ``True`` when the log reached its end, ``False`` when no new
        log line arrived for more than ``DEVICE_HANGING_TIMEOUT_MIN`` minutes.
    """
    time_limit = timedelta(minutes=DEVICE_HANGING_TIMEOUT_MIN)
    line_count = 0
    finished = False
    last_time_logs = datetime.now()
    while not finished:
        # line_count is passed back so that only log entries not yet printed
        # are requested.
        (finished, data) = _call_proxy(proxy.scheduler.jobs.logs, job_id, line_count)
        logs = yaml.load(str(data), Loader=loader(False))
        if logs:
            # Reset the timeout
            last_time_logs = datetime.now()
            for line in logs:
                print("{} {}".format(line["dt"], line["msg"]))

            line_count += len(logs)

        elif datetime.now() - last_time_logs > time_limit:
            print_log("LAVA job {} doesn't advance (machine got hung?). Retry.".format(job_id))
            return False

        # `proxy.scheduler.jobs.logs` does not block, even when there is no
        # new log to be fetched. To avoid dosing the LAVA dispatcher
        # machine, let's add a sleep to save them some stamina.
        time.sleep(LOG_POLLING_TIME_SEC)

    return True


def show_job_data(proxy, job_id):
    """Print every field returned by ``scheduler.jobs.show`` for ``job_id``."""
    show = _call_proxy(proxy.scheduler.jobs.show, job_id)
    for field, value in show.items():
        print("{}\t: {}".format(field, value))


def validate_job(proxy, job_file):
    """Ask LAVA to validate the job definition ``job_file``.

    Returns:
        The value returned by ``scheduler.jobs.validate``, or ``False`` when
        the call failed for any reason.
    """
    try:
        return _call_proxy(proxy.scheduler.jobs.validate, job_file, True)
    except (Exception, SystemExit):
        # _call_proxy reports RPC failures through fatal_err(), i.e. by
        # raising SystemExit; that must be turned into False here so the
        # caller can print its own message. KeyboardInterrupt still
        # propagates.
        return False


def submit_job(proxy, job_file):
    """Submit the job definition ``job_file`` and return what LAVA answers."""
    return _call_proxy(proxy.scheduler.jobs.submit, job_file)


def main(args):
    """Submit the generated job to LAVA and follow it until a verdict.

    A job whose log stops advancing is cancelled and resubmitted, at most
    ``NUMBER_OF_RETRIES_TIMEOUT_DETECTION`` extra times. A job for which
    ``get_job_results`` returns ``False`` is resubmitted without consuming
    one of those retries.

    Raises:
        SystemExit: via ``fatal_err`` when validation fails, when the test
        fails, or when every attempt timed out.
    """
    proxy = setup_lava_proxy()

    yaml_file = generate_lava_yaml(args)

    if args.dump_yaml:
        # Work on a copy so the real token in `args` is left untouched.
        censored_args = argparse.Namespace(**vars(args))
        censored_args.jwt = "jwt-hidden"
        print(generate_lava_yaml(censored_args))

    if args.validate_only:
        ret = validate_job(proxy, yaml_file)
        if not ret:
            fatal_err("Error in LAVA job definition")
        print("LAVA job definition validated successfully")
        return

    retry_count = NUMBER_OF_RETRIES_TIMEOUT_DETECTION

    while retry_count >= 0:
        job_id = submit_job(proxy, yaml_file)

        print_log("LAVA job id: {}".format(job_id))

        wait_until_job_is_started(proxy, job_id)

        if not follow_job_execution(proxy, job_id):
            print_log(f"Job {job_id} has timed out. Cancelling it.")
            # Cancel the job as it is considered unreachable by Mesa CI.
            _call_proxy(proxy.scheduler.jobs.cancel, job_id)

            retry_count -= 1
            continue

        show_job_data(proxy, job_id)

        if get_job_results(proxy, job_id, "0_mesa", "mesa"):
            break
    else:
        # Only reached when the loop condition became false, i.e. every
        # attempt timed out. Without this the script would exit with status
        # 0 although no job ever produced a result.
        fatal_err("LAVA job timed out on all {} attempts. Giving up.".format(
            NUMBER_OF_RETRIES_TIMEOUT_DETECTION + 1))


if __name__ == '__main__':
    # given that we proxy from DUT -> LAVA dispatcher -> LAVA primary -> us ->
    # GitLab runner -> GitLab primary -> user, safe to say we don't need any
    # more buffering
    sys.stdout.reconfigure(line_buffering=True)
    sys.stderr.reconfigure(line_buffering=True)
    parser = argparse.ArgumentParser("LAVA job submitter")

    parser.add_argument("--pipeline-info")
    parser.add_argument("--base-system-url-prefix")
    parser.add_argument("--mesa-build-url")
    parser.add_argument("--job-rootfs-overlay-url")
    parser.add_argument("--job-artifacts-base")
    parser.add_argument("--job-timeout", type=int)
    parser.add_argument("--first-stage-init")
    parser.add_argument("--ci-project-dir")
    parser.add_argument("--device-type")
    parser.add_argument("--dtb", nargs='?', default="")
    parser.add_argument("--kernel-image-name")
    parser.add_argument("--kernel-image-type", nargs='?', default="")
    parser.add_argument("--boot-method")
    parser.add_argument("--lava-tags", nargs='?', default="")
    parser.add_argument("--jwt")
    parser.add_argument("--validate-only", action='store_true')
    parser.add_argument("--dump-yaml", action='store_true')
    parser.add_argument("--visibility-group")

    parser.set_defaults(func=main)
    args = parser.parse_args()
    args.func(args)