From 0f8e20bc73ea9f4f65aafbaa44ab4bd518eee4f4 Mon Sep 17 00:00:00 2001 From: Georg Kallidis Date: Wed, 13 Nov 2024 12:21:41 +0100 Subject: [PATCH 1/9] Fix indentation in line KibbleBit.updateSource --- src/plugins/scanners/jira.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/plugins/scanners/jira.py b/src/plugins/scanners/jira.py index e996b7c..58a8daa 100644 --- a/src/plugins/scanners/jira.py +++ b/src/plugins/scanners/jira.py @@ -330,13 +330,13 @@ def scan(KibbleBit, source): except requests.exceptions.ConnectionError as err: KibbleBit.pprint("Connection error, skipping this ticket for now!") source['steps']['issues'] = { - 'time': time.time(), - 'status': 'Connection error occurred while scanning', - 'running': False, - 'good': False - } - KibbleBit.updateSource(source) - return + 'time': time.time(), + 'status': 'Connection error occurred while scanning', + 'running': False, + 'good': False + } + KibbleBit.updateSource(source) + return if 'issues' in js and len(js['issues']) == 1: key = js['issues'][0]['key'] m = re.search(r"-(\d+)$", key) From 1da9b2622537b0f0140d67e9a809cc5f46f4cbfd Mon Sep 17 00:00:00 2001 From: Georg Kallidis Date: Thu, 14 Nov 2024 12:31:15 +0100 Subject: [PATCH 2/9] More configurable elasticsearch; add a versionhint in config.yaml for configuration initialization; Make KibbleWrapper classes private;i Allow EL 8.x configuration, currently (untested) basic authentication with options; Add debug traceback method in kibbleBit class; Fix pyyaml 5.1+ requires Loader=; --- conf/config.yaml | 1 + src/kibble-scanner.py | 2 +- src/plugins/brokers/kibbleES.py | 70 +++++++++++++++++++++++---------- 3 files changed, 51 insertions(+), 22 deletions(-) diff --git a/conf/config.yaml b/conf/config.yaml index d835539..98eb724 100644 --- a/conf/config.yaml +++ b/conf/config.yaml @@ -6,6 +6,7 @@ elasticsearch: ssl: false uri: "" database: kibble + versionHint: 8 # If enabled, kibble scanners will use the HTTP JSON API 
broker: diff --git a/src/kibble-scanner.py b/src/kibble-scanner.py index 6d67954..c8677ba 100644 --- a/src/kibble-scanner.py +++ b/src/kibble-scanner.py @@ -119,7 +119,7 @@ def main(): # Load config yaml if args.config: CONFIG_FILE = args.config - config = yaml.load(open(CONFIG_FILE)) + config = yaml.load(open(CONFIG_FILE), Loader=yaml.Loader) pprint("Loaded YAML config from %s" % CONFIG_FILE) # Which broker type do we use here? diff --git a/src/plugins/brokers/kibbleES.py b/src/plugins/brokers/kibbleES.py index 6e158a0..aeaaf64 100644 --- a/src/plugins/brokers/kibbleES.py +++ b/src/plugins/brokers/kibbleES.py @@ -25,7 +25,7 @@ ACCEPTED_DB_VERSIONS = [1,2] # Versions we know how to work with. -class KibbleESWrapper(object): +class _KibbleESWrapper(object): """ Class for rewriting old-style queries to the new ones, where doc_type is an integral part of the DB name @@ -66,13 +66,17 @@ def __init__(self, ES): def exists(self, index): return self.ES.indices.exists(index = index) -class KibbleESWrapperSeven(object): +class _KibbleESWrapperSeven(object): """ Class for rewriting old-style queries to the new ones, where doc_type is an integral part of the DB name and NOT USED (>= 7.x) """ - def __init__(self, ES): - self.ES = ES + def __init__(self, ES, auth): + if (auth is not None): + self.ES = ES.options(basic_auth=auth) + else: + self.ES = ES + self.indices = self.indicesClass(ES) def get(self, index, doc_type, id): @@ -104,8 +108,8 @@ def __init__(self, ES): def exists(self, index): return self.ES.indices.exists(index = index) + - # This is redundant, refactor later? 
def pprint(string, err = False): @@ -144,9 +148,9 @@ def pprint(self, string, err = False): def updateSource(self, source): """ Updates a source document, usually with a status update """ self.broker.DB.index(index=self.broker.config['elasticsearch']['database'], - doc_type="source", - id=source['sourceID'], - body = source + doc_type="source", + id=source['sourceID'], + body = source ) def get(self, doctype, docid): @@ -189,14 +193,16 @@ def bulk(self): dbname = self.broker.config['elasticsearch']['database'] if self.broker.noTypes: dbname += "_%s" % js['doctype'] - js_arr.append({ + defaultJSON = { '_op_type': 'update' if js.get('upsert') else 'index', '_index': dbname, - '_type': '_doc', '_id': js['id'], 'doc' if js.get('upsert') else '_source': doc, 'doc_as_upsert': True, - }) + } + if self.broker.seven is False: + defaultJSON['_type'] = '_doc' + js_arr.append( defaultJSON ) else: js_arr.append({ '_op_type': 'update' if js.get('upsert') else 'index', @@ -210,6 +216,15 @@ def bulk(self): elasticsearch.helpers.bulk(self.broker.oDB, js_arr) except Exception as err: pprint("Warning: Could not bulk insert: %s" % err) + self.traceBack() + + def traceBack(): + err_type, err_value, tb = sys.exc_info() + traceback_output = ['API traceback:'] + traceback_output += traceback.format_tb(tb) + traceback_output.append('%s: %s' % (err_type.__name__, err_value)) + pprint("Error: traceback_output: %s" % (traceback_output)) + return traceback_output class KibbleOrganisation: @@ -279,14 +294,23 @@ def __init__(self, config): if 'user' in es_config: auth = (es_config['user'], es_config['password']) pprint("Connecting to ElasticSearch database at %s:%i..." 
% (es_config['hostname'], es_config.get('port', 9200))) - es = elasticsearch.Elasticsearch([{ + + defaultELConfig = { 'host': es_config['hostname'], - 'port': int(es_config.get('port', 9200)), - 'use_ssl': es_config.get('ssl', False), - 'verify_certs': False, - 'url_prefix': es_config.get('uri', ''), - 'http_auth': auth - }], + 'port': int(es_config.get('port', 9200)) + } + versionHint = config['elasticsearch']['versionHint'] + if (versionHint >= 7): + defaultELConfig['scheme'] = 'https' if (es_config['ssl']) else 'http' + defaultELConfig['path_prefix'] = es_config.get('uri', '') + # defaultELConfig['basic_auth'] = auth configured like .options(basic_auth=auth)).search + else: + defaultELConfig['use_ssl'] = es_config.get('ssl', False) + defaultELConfig['verify_certs'] = False + defaultELConfig['url_prefix'] = es_config.get('uri', '') + defaultELConfig['http_auth'] = auth + + es = elasticsearch.Elasticsearch([ defaultELConfig ], max_retries=5, retry_on_timeout=True ) @@ -299,13 +323,17 @@ def __init__(self, config): # This bit is required since ES 6.x and above don't like document types self.noTypes = True if int(es_info['version']['number'].split('.')[0]) >= 6 else False self.seven = True if int(es_info['version']['number'].split('.')[0]) >= 7 else False + self.eight = True if int(es_info['version']['number'].split('.')[0]) >= 8 else False if self.noTypes: pprint("This is a type-less DB, expanding database names instead.") - if self.seven: + if self.eight && auth is not None: + pprint("We're using ES >= 8.x, NO DOC_TYPE WITH BASIC_AUTH OPTIONS ") + es = _KibbleESWrapperSeven(es, auth) + elif self.seven: pprint("We're using ES >= 7.x, NO DOC_TYPE!") - es = KibbleESWrapperSeven(es) + es = _KibbleESWrapperSeven(es, None) else: - es = KibbleESWrapper(es) + es = _KibbleESWrapper(es) self.DB = es if not es.indices.exists(index = es_config['database'] + "_api"): sys.stderr.write("Could not find database group %s_* in ElasticSearch!\n" % es_config['database']) From 
60b86be636283d91a661210acb6b58e1ffb6c9be Mon Sep 17 00:00:00 2001 From: Georg Kallidis Date: Tue, 19 Nov 2024 14:26:24 +0100 Subject: [PATCH 3/9] Explain source argument restriction, allow sourceURL --- src/kibble-scanner.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/kibble-scanner.py b/src/kibble-scanner.py index c8677ba..ac43477 100644 --- a/src/kibble-scanner.py +++ b/src/kibble-scanner.py @@ -38,7 +38,7 @@ def base_parser(): arg_parser.add_argument("-o", "--org", help="The organisation to gather stats for. If left out, all organisations will be scanned.") arg_parser.add_argument("-f", "--config", help="Location of the yaml config file (full path)") arg_parser.add_argument("-a", "--age", help="Minimum age in hours before performing a new scan on an already processed source. --age 12 will not process any source that was processed less than 12 hours ago, but will process new sources.") - arg_parser.add_argument("-s", "--source", help="A specific source (wildcard) to run scans on.") + arg_parser.add_argument("-s", "--source", help="A specific (existing in any org) source (wildcard) to run scans on.") arg_parser.add_argument("-n", "--nodes", help="Number of nodes in the cluster (used for load balancing)") arg_parser.add_argument("-t", "--type", help="Specific type of scanner to run (default is run all scanners)") arg_parser.add_argument("-e", "--exclude", nargs = '+', help="Specific type of scanner(s) to exclude") @@ -157,7 +157,8 @@ def main(): else: PENDING_OBJECTS = [] for source in org.sources(view=args.view): - if not args.source or (args.source == source['sourceID']): + #pprint("Checkng source %s" % source) + if not args.source or (args.source == source['sourceID']) or (args.source == source['sourceURL']): PENDING_OBJECTS.append(source) sourceNo += len(PENDING_OBJECTS) From 009103608317b8da39bab1dd547bc062d1efb42c Mon Sep 17 00:00:00 2001 From: Georg Kallidis Date: Tue, 19 Nov 2024 14:26:56 +0100 Subject: [PATCH 4/9] Fix 
condition, add self argument for class method traceback --- src/plugins/brokers/kibbleES.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/plugins/brokers/kibbleES.py b/src/plugins/brokers/kibbleES.py index aeaaf64..a581f30 100644 --- a/src/plugins/brokers/kibbleES.py +++ b/src/plugins/brokers/kibbleES.py @@ -218,7 +218,7 @@ def bulk(self): pprint("Warning: Could not bulk insert: %s" % err) self.traceBack() - def traceBack(): + def traceBack(self): err_type, err_value, tb = sys.exc_info() traceback_output = ['API traceback:'] traceback_output += traceback.format_tb(tb) @@ -326,7 +326,7 @@ def __init__(self, config): self.eight = True if int(es_info['version']['number'].split('.')[0]) >= 8 else False if self.noTypes: pprint("This is a type-less DB, expanding database names instead.") - if self.eight && auth is not None: + if self.eight and auth is not None: pprint("We're using ES >= 8.x, NO DOC_TYPE WITH BASIC_AUTH OPTIONS ") es = _KibbleESWrapperSeven(es, auth) elif self.seven: From e7e1499790b5205134e45d3c26ad6f3ab214a2c6 Mon Sep 17 00:00:00 2001 From: Georg Kallidis Date: Tue, 19 Nov 2024 14:29:28 +0100 Subject: [PATCH 5/9] Fix scan for github sources by adding empty steps key --- src/plugins/scanners/git-sync.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/plugins/scanners/git-sync.py b/src/plugins/scanners/git-sync.py index 5997f3e..234e33b 100644 --- a/src/plugins/scanners/git-sync.py +++ b/src/plugins/scanners/git-sync.py @@ -30,13 +30,14 @@ def accepts(source): """ Do we accept this source? """ if source['type'] == 'git': return True - # There are cases where we have a github repo, but don't wanna annalyze the code, just issues + # There are cases where we have a github repo, but don't wanna analyze the code, just issues if source['type'] == 'github' and source.get('issuesonly', False) == False: return True return False def scan(KibbleBit, source): + #KibbleBit.pprint("Scan source: %s." 
% source) # Get some vars, construct a data path for the repo path = source['sourceID'] url = source['sourceURL'] @@ -63,6 +64,8 @@ def scan(KibbleBit, source): KibbleBit.pprint("Checking out %s as %s" % (url, path)) try: + if 'steps' not in source: # initial fetch of a github repo may miss steps + source['steps'] = {} source['steps']['sync'] = { 'time': time.time(), 'status': 'Fetching code data from source location...', From 55f5d27313d696fd771522c61277739a36d2580f Mon Sep 17 00:00:00 2001 From: Georg Kallidis Date: Fri, 22 Nov 2024 10:37:04 +0100 Subject: [PATCH 6/9] Github-issue: Fix reports from github with closed_at with None value; KibbleES: Fix missing traceback module and check result from el.helpers.bulk; Add print statements. --- src/plugins/brokers/kibbleES.py | 66 ++++++++++++++------------- src/plugins/scanners/github-issues.py | 13 +++--- 2 files changed, 41 insertions(+), 38 deletions(-) diff --git a/src/plugins/brokers/kibbleES.py b/src/plugins/brokers/kibbleES.py index a581f30..28364b6 100644 --- a/src/plugins/brokers/kibbleES.py +++ b/src/plugins/brokers/kibbleES.py @@ -20,6 +20,7 @@ import elasticsearch.helpers import threading import sys +import traceback KIBBLE_DB_VERSION = 2 # Current DB struct version ACCEPTED_DB_VERSIONS = [1,2] # Versions we know how to work with. 
@@ -33,7 +34,7 @@ class _KibbleESWrapper(object): def __init__(self, ES): self.ES = ES self.indices = self.indicesClass(ES) - + def get(self, index, doc_type, id): return self.ES.get(index = index+'_'+doc_type, doc_type = '_doc', id = id) def exists(self, index, doc_type, id): @@ -57,12 +58,12 @@ def count(self, index, doc_type, body = None): doc_type = '_doc', body = body ) - + class indicesClass(object): """ Indices helper class """ def __init__(self, ES): self.ES = ES - + def exists(self, index): return self.ES.indices.exists(index = index) @@ -76,9 +77,9 @@ def __init__(self, ES, auth): self.ES = ES.options(basic_auth=auth) else: self.ES = ES - + self.indices = self.indicesClass(ES) - + def get(self, index, doc_type, id): return self.ES.get(index = index+'_'+doc_type, id = id) def exists(self, index, doc_type, id): @@ -100,15 +101,15 @@ def count(self, index, doc_type, body = None): index = index+'_'+doc_type, body = body ) - + class indicesClass(object): """ Indices helper class """ def __init__(self, ES): self.ES = ES - + def exists(self, index): return self.ES.indices.exists(index = index) - + # This is redundant, refactor later? 
@@ -121,7 +122,7 @@ def pprint(string, err = False): class KibbleBit: """ KibbleBit class with direct ElasticSearch access """ - + def __init__(self, broker, organisation, tid): self.config = broker.config self.organisation = organisation @@ -131,20 +132,20 @@ def __init__(self, broker, organisation, tid): self.pluginname = "" self.tid = tid self.dbname = self.broker.config['elasticsearch']['database'] - + def __del__(self): """ On unload/delete, push the last chunks of data to ES """ if self.json_queue: print("Pushing stragglers") self.bulk() - + def pprint(self, string, err = False): line = "[thread#%i:%s]: %s" % (self.tid, self.pluginname, string) if err: sys.stderr.write(line + "\n") else: print(line) - + def updateSource(self, source): """ Updates a source document, usually with a status update """ self.broker.DB.index(index=self.broker.config['elasticsearch']['database'], @@ -152,23 +153,23 @@ def updateSource(self, source): id=source['sourceID'], body = source ) - + def get(self, doctype, docid): """ Fetches a document from the DB """ doc = self.broker.DB.get(index=self.broker.config['elasticsearch']['database'], doc_type=doctype, id = docid) if doc: return doc['_source'] return None - + def exists(self, doctype, docid): """ Checks whether a document already exists or not """ return self.broker.DB.exists(index=self.broker.config['elasticsearch']['database'], doc_type=doctype, id = docid) - + def index(self, doctype, docid, document): """ Adds a new document to the index """ dbname = self.broker.config['elasticsearch']['database'] - self.broker.DB.index(index=dbname, doc_type = doctype, id = docid, body = document) - + self.broker.DB.index(index=dbname, doc_type = doctype, id = docid, body = document) + def append(self, t, doc): """ Append a document to the bulk push queue """ if not 'id' in doc: @@ -180,7 +181,7 @@ def append(self, t, doc): if len(self.json_queue) > self.queueMax: pprint("Bulk push forced") self.bulk() - + def bulk(self): """ Push pending 
JSON objects in the queue to ES""" xjson = self.json_queue @@ -193,6 +194,7 @@ def bulk(self): dbname = self.broker.config['elasticsearch']['database'] if self.broker.noTypes: dbname += "_%s" % js['doctype'] + #del doc['doctype'] defaultJSON = { '_op_type': 'update' if js.get('upsert') else 'index', '_index': dbname, @@ -213,28 +215,30 @@ def bulk(self): 'doc_as_upsert': True, }) try: - elasticsearch.helpers.bulk(self.broker.oDB, js_arr) + #print("Bulk insert JSON %s." % js_arr) + res = elasticsearch.helpers.bulk(self.broker.oDB, js_arr) + print("Result bulk: ", res) except Exception as err: pprint("Warning: Could not bulk insert: %s" % err) self.traceBack() - + def traceBack(self): err_type, err_value, tb = sys.exc_info() traceback_output = ['API traceback:'] traceback_output += traceback.format_tb(tb) traceback_output.append('%s: %s' % (err_type.__name__, err_value)) - pprint("Error: traceback_output: %s" % (traceback_output)) + print("Traceback: ", traceback_output ) return traceback_output - + class KibbleOrganisation: """ KibbleOrg with direct ElasticSearch access """ def __init__(self, broker, org): """ Init an org, set up ElasticSearch for KibbleBits later on """ - + self.broker = broker self.id = org - + def sources(self, sourceType = None, view = None): """ Get all sources or sources of a specific type for an org """ s = [] @@ -280,7 +284,7 @@ def sources(self, sourceType = None, view = None): } } ) - + for hit in res['hits']['hits']: if sourceType == None or hit['_source']['type'] == sourceType: s.append(hit['_source']) @@ -294,7 +298,7 @@ def __init__(self, config): if 'user' in es_config: auth = (es_config['user'], es_config['password']) pprint("Connecting to ElasticSearch database at %s:%i..." 
% (es_config['hostname'], es_config.get('port', 9200))) - + defaultELConfig = { 'host': es_config['hostname'], 'port': int(es_config.get('port', 9200)) @@ -309,7 +313,7 @@ def __init__(self, config): defaultELConfig['verify_certs'] = False defaultELConfig['url_prefix'] = es_config.get('uri', '') defaultELConfig['http_auth'] = auth - + es = elasticsearch.Elasticsearch([ defaultELConfig ], max_retries=5, retry_on_timeout=True @@ -352,11 +356,11 @@ def __init__(self, config): if apidoc['dbversion'] < KIBBLE_DB_VERSION: sys.stderr.write("The database '%s' uses an older structure format (version %u) than the scanners (version %u). Please upgrade your main Kibble server.\n" % (es_config['database'], apidoc['dbversion'], KIBBLE_DB_VERSION)) sys.exit(-1) - + def organisations(self): """ Return a list of all organisations """ orgs = [] - + # Run the search, fetch all orgs, 9999 max. TODO: Scroll??? res = self.DB.search( index=self.config['elasticsearch']['database'], @@ -368,10 +372,8 @@ def organisations(self): } } ) - + for hit in res['hits']['hits']: org = hit['_source']['id'] orgClass = KibbleOrganisation(self, org) yield orgClass - - diff --git a/src/plugins/scanners/github-issues.py b/src/plugins/scanners/github-issues.py index ebab0eb..bea1180 100644 --- a/src/plugins/scanners/github-issues.py +++ b/src/plugins/scanners/github-issues.py @@ -59,7 +59,7 @@ def make_issue(source, issue, people): owner_email = people[issue['user']['login']]['email'] issue_closer = owner_email - if 'closed_by' in issue: + if 'closed_by' in issue and issue['closed_by'] is not None: issue_closer = people[issue['closed_by']['login']] # Is this an issue ro a pull request? 
itype = "issue" @@ -116,7 +116,7 @@ def update_issue(KibbleBit, issue): def update_person(KibbleBit, person): person['upsert'] = True KibbleBit.append('person', person) - + def scan(KibbleBit, source, firstAttempt = True): auth=None @@ -160,7 +160,8 @@ def scan(KibbleBit, source, firstAttempt = True): people[issue['user']['login']] = person update_person(KibbleBit, person) - if 'closed_by' in issue and not issue['closed_by']['login'] in people: + #KibbleBit.pprint("issue: %s" % issue ) + if 'closed_by' in issue and issue['closed_by'] is not None and not issue['closed_by']['login'] in people: closer = make_person(source, issue, plugins.utils.github.user(issue['closed_by']['url'], auth=auth)) people[issue['closed_by']['login']] = closer @@ -176,7 +177,7 @@ def scan(KibbleBit, source, firstAttempt = True): continue update_issue(KibbleBit, doc) - + source['steps']['issues'] = { 'time': time.time(), 'status': 'Issue scan completed at ' + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()), @@ -201,8 +202,8 @@ def scan(KibbleBit, source, firstAttempt = True): if plugins.utils.github.get_tokens_left(auth=auth) > 10: scan(KibbleBit, source, False) # If this one fails, bail completely return - - + + KibbleBit.pprint("HTTP Error, rate limit exceeded?") source['steps']['issues'] = { 'time': time.time(), From b8ef9a3f7ef63c8d6f1491f8d6b61f24cade6361 Mon Sep 17 00:00:00 2001 From: Georg Kallidis Date: Wed, 27 Nov 2024 13:21:13 +0100 Subject: [PATCH 7/9] Add pre-commit configuration and docs; more accurate prints in bulk method, replace deprecated utcnow, use importlib for plugins.utils imports; fix in jira module non existing url variable to param source, fix in urlsmisc import base64 and remove unassigned variable te; remove unused imports in utils.git module; Update README.md In scanners jira,git-census,ponymail: Fix missing steps in source (might be true for other scanners, and root cause currently unknown). Fix exception in creds checking in jira. 
If es_doc variable has a key doc use this key in git-issues.py. --- .pre-commit-config.yaml | 31 ++++++++++ README.md | 35 ++++++++++- requirements.txt | 1 + src/plugins/brokers/kibbleES.py | 4 +- src/plugins/scanners/bugzilla.py | 55 +++++++++-------- src/plugins/scanners/buildbot.py | 65 ++++++++++---------- src/plugins/scanners/discourse.py | 87 ++++++++++++++------------- src/plugins/scanners/gerrit.py | 5 +- src/plugins/scanners/git-census.py | 17 +++--- src/plugins/scanners/git-evolution.py | 40 ++++++------ src/plugins/scanners/git-sloc.py | 27 +++++---- src/plugins/scanners/git-sync.py | 29 +++++---- src/plugins/scanners/github-issues.py | 26 ++++---- src/plugins/scanners/github-stats.py | 31 +++++----- src/plugins/scanners/jira.py | 64 +++++++++++--------- src/plugins/scanners/ponymail.py | 41 +++++++------ src/plugins/utils/git.py | 4 +- src/plugins/utils/urlmisc.py | 6 +- 18 files changed, 325 insertions(+), 243 deletions(-) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..88a4998 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +default_stages: [pre-commit, pre-push] +default_language_version: + # force all unspecified python hooks to run python3 + python: python3 +minimum_pre_commit_version: "3.4.0" + +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: check-yaml + - id: end-of-file-fixer + - id: trailing-whitespace diff --git a/README.md b/README.md index f2c99cc..8e85410 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ The Kibble Scanners collect information for the Kibble Suite. usage: kibble-scanner.py [-h] [-o ORG] [-f CONFIG] [-a AGE] [-s SOURCE] [-n NODES] [-t TYPE] [-e EXCLUDE [EXCLUDE ...]] [-v VIEW] - + optional arguments: -h, --help show this help message and exit -o ORG, --org ORG The organisation to gather stats for. If left out, all @@ -75,7 +75,36 @@ The Kibble Scanners collect information for the Kibble Suite. - python3-certifi - python3-yaml - + ## Build environment + +### Pre-commit + + After running + + pip install -r requirements.txt + + Run + + pre-commit install + + to install + + Run it explicitely by + + pre-commit + + to run the checks in .pre-commit-config.yaml + + If installed the pre-commit reads the configuration, and will check on the hooks, currently pre-comit and pre-push. + While the checks are not satisfied, just rerun the commit command until the hook checks are passed. + +### Testing + + TBD + +### Project build + + TBD + # Get involved TBD. Please see https://kibble.apache.org/ for details! - diff --git a/requirements.txt b/requirements.txt index 7db5a42..668c04d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ psutil python-dateutil requests pyyaml +pre-commit diff --git a/src/plugins/brokers/kibbleES.py b/src/plugins/brokers/kibbleES.py index 28364b6..9c1d14a 100644 --- a/src/plugins/brokers/kibbleES.py +++ b/src/plugins/brokers/kibbleES.py @@ -215,10 +215,10 @@ def bulk(self): 'doc_as_upsert': True, }) try: - #print("Bulk insert JSON %s." 
% js_arr) res = elasticsearch.helpers.bulk(self.broker.oDB, js_arr) - print("Result bulk: ", res) + print("Result (success,failed): ", res) except Exception as err: + print("Error for INPUT JSON %s." % js_arr) pprint("Warning: Could not bulk insert: %s" % err) self.traceBack() diff --git a/src/plugins/scanners/bugzilla.py b/src/plugins/scanners/bugzilla.py index 4d9ca37..447af26 100644 --- a/src/plugins/scanners/bugzilla.py +++ b/src/plugins/scanners/bugzilla.py @@ -120,10 +120,10 @@ def scanTicket(bug, KibbleBit, source, openTickets, u, dom): else: pass #print("Ticket hasn't changed, ignoring...") - + if parseIt: KibbleBit.pprint("Parsing data from BugZilla for #%s" % key) - + params = { 'ids': [int(key)], 'limit': 0 @@ -132,7 +132,7 @@ def scanTicket(bug, KibbleBit, source, openTickets, u, dom): params['Bugzilla_login'] = source['creds']['username'] params['Bugzilla_password'] = source['creds']['password'] ticketsURL = "%s?method=Bug.get¶ms=[%s]" % (u, urllib.parse.quote(json.dumps(params))) - + js = plugins.utils.jsonapi.get(ticketsURL) js= js['result']['bugs'][0] creator = { @@ -162,17 +162,17 @@ def scanTicket(bug, KibbleBit, source, openTickets, u, dom): ticketsURL = "%s?method=Bug.comments¶ms=[%s]" % (u, urllib.parse.quote(json.dumps(params))) hjs = plugins.utils.jsonapi.get(ticketsURL) comments = len(hjs['result']['bugs'][str(key)]['comments']) - + title = bug['summary'] del params['ids'] if closer: - + pid = hashlib.sha1( ("%s%s" % (source['organisation'], closer['email'])).encode('ascii', errors='replace')).hexdigest() found = KibbleBit.exists('person', pid) if not found: params['names'] = [closer['email']] ticketsURL = "%s?method=User.get¶ms=[%s]" % (u, urllib.parse.quote(json.dumps(params))) - + try: ujs = plugins.utils.jsonapi.get(ticketsURL) displayName = ujs['result']['users'][0]['real_name'] @@ -180,7 +180,7 @@ def scanTicket(bug, KibbleBit, source, openTickets, u, dom): displayName = closer['email'] if displayName and len(displayName) > 0: # Add to 
people db - + jsp = { 'name': displayName, 'email': closer['email'], @@ -189,7 +189,7 @@ def scanTicket(bug, KibbleBit, source, openTickets, u, dom): } #print("Updating person DB for closer: %s (%s)" % (displayName, closerEmail)) KibbleBit.index('person', pid, jsp) - + if creator: pid = hashlib.sha1( ("%s%s" % (source['organisation'], creator['email'])).encode('ascii', errors='replace')).hexdigest() found = KibbleBit.exists('person', pid) @@ -204,7 +204,7 @@ def scanTicket(bug, KibbleBit, source, openTickets, u, dom): creator['name'] = creator['email'] if creator['name'] and len(creator['name']) > 0: # Add to people db - + jsp = { 'name': creator['name'], 'email': creator['email'], @@ -212,7 +212,7 @@ def scanTicket(bug, KibbleBit, source, openTickets, u, dom): 'id' :pid } KibbleBit.index('person', pid, jsp) - + jso = { 'id': dhash, 'key': key, @@ -223,7 +223,7 @@ def scanTicket(bug, KibbleBit, source, openTickets, u, dom): 'created': cd, 'closed': rd, 'issuetype': 'issue', - 'issueCloser': closer['email'] if 'email' in closer else None, + 'issueCloser': closer['email'] if 'email' in closer else None, 'createdDate': time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(cd)), 'closedDate': time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(rd)) if rd else None, 'changeDate': time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(rd if rd else cd)), @@ -238,8 +238,8 @@ def scanTicket(bug, KibbleBit, source, openTickets, u, dom): except Exception as err: KibbleBit.pprint(err) return False - - + + class bzThread(Thread): @@ -252,10 +252,10 @@ def __init__(self, KibbleBit, source, block, pt, ot, u, dom): self.openTickets = ot self.u = u self.dom = dom - + def run(self): badOnes = 0 - + while len(self.pendingTickets) > 0 and badOnes <= 50: if len(self.pendingTickets) % 10 == 0: self.KibbleBit.pprint("%u elements left to count" % len(self.pendingTickets)) @@ -284,13 +284,16 @@ def run(self): return else: badOnes = 0 - + def scan(KibbleBit, source): path = source['sourceID'] url = 
source['sourceURL'] - + + if not 'steps' in source: + source['steps'] = {} + source['steps']['issues'] = { 'time': time.time(), 'status': 'Parsing BugZilla changes...', @@ -298,7 +301,7 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - + bz = re.match(r"(https?://\S+?)(/jsonrpc\.cgi)?[\s:?]+(.+)", url) if bz: if source['creds'] and 'username' in source['creds'] and source['creds']['username'] and len(source['creds']['username']) > 0: @@ -313,10 +316,10 @@ def scan(KibbleBit, source): u = "%s/jsonrpc.cgi" % dom instance = bz.group(3) lastTicket = 0 - + params = { 'product': [instance], - 'status': ["RESOLVED", "CLOSED", "NEW","UNCOMFIRMED","ASSIGNED","REOPENED","VERIFIED"], + 'status': ["RESOLVED", "CLOSED", "NEW","UNCOMFIRMED","ASSIGNED","REOPENED","VERIFIED"], 'include_fields': ['id', 'creation_time', 'status', 'summary', 'creator'], 'limit': 10000, 'offset': 1 @@ -324,12 +327,12 @@ def scan(KibbleBit, source): # If * is requested, just omit the product name if instance == '*': params = { - 'status': ["RESOLVED", "CLOSED", "NEW","UNCOMFIRMED","ASSIGNED","REOPENED","VERIFIED"], + 'status': ["RESOLVED", "CLOSED", "NEW","UNCOMFIRMED","ASSIGNED","REOPENED","VERIFIED"], 'include_fields': ['id', 'creation_time', 'status', 'summary', 'creator'], 'limit': 10000, 'offset': 1 } - + ticketsURL = "%s?method=Bug.search¶ms=[%s]" % (u, urllib.parse.quote(json.dumps(params))) while True: @@ -338,7 +341,7 @@ def scan(KibbleBit, source): except: KibbleBit.pprint("Couldn't fetch more tickets, bailing") break - + if len(js['result']['bugs']) > 0: KibbleBit.pprint("%s: Found %u tickets..." % (source['sourceURL'], ((params.get('offset', 1)-1) + len(js['result']['bugs'])))) for bug in js['result']['bugs']: @@ -350,7 +353,7 @@ def scan(KibbleBit, source): else: KibbleBit.pprint("No more tickets left to scan") break - + KibbleBit.pprint("Found %u open tickets, %u closed." 
% (len(openTickets), len(pendingTickets) - len(openTickets))) badOnes = 0 @@ -360,10 +363,10 @@ def scan(KibbleBit, source): t = bzThread(KibbleBit, source, block, pendingTickets, openTickets, u, dom) threads.append(t) t.start() - + for t in threads: t.join() - + source['steps']['issues'] = { 'time': time.time(), diff --git a/src/plugins/scanners/buildbot.py b/src/plugins/scanners/buildbot.py index b99f5bf..566e86c 100644 --- a/src/plugins/scanners/buildbot.py +++ b/src/plugins/scanners/buildbot.py @@ -30,7 +30,7 @@ """ title = "Scanner for Buildbot" -version = "0.1.0" +version = "0.1.1" def accepts(source): """ Determines whether we want to handle this source """ @@ -41,17 +41,17 @@ def accepts(source): def scanJob(KibbleBit, source, job, creds): """ Scans a single job for activity """ - NOW = int(datetime.datetime.utcnow().timestamp()) + NOW = int(datetime.datetime.now(datetime.timezone.utc).timestamp()) dhash = hashlib.sha224( ("%s-%s-%s" % (source['organisation'], source['sourceID'], job) ).encode('ascii', errors='replace')).hexdigest() found = True doc= None parseIt = False found = KibbleBit.exists('cijob', dhash) - + jobURL = "%s/api/v2/builders/%s/builds" % (source['sourceURL'], job) KibbleBit.pprint(jobURL) jobjson = plugins.utils.jsonapi.get(jobURL, auth = creds) - + # If valid JSON, ... if jobjson: for buildno, data in jobjson.items(): @@ -61,16 +61,16 @@ def scanJob(KibbleBit, source, job, creds): builddoc = KibbleBit.get('ci_build', buildhash) except: pass - + # If this build already completed, no need to parse it again if builddoc and builddoc.get('completed', False): continue - + KibbleBit.pprint("[%s-%s] This is new or pending, analyzing..." 
% (job, buildno)) - + completed = True if 'currentStep' in data else False - - + + # Get build status (success, failed, canceled etc) status = 'building' if 'successful' in data.get('text', []): @@ -79,7 +79,7 @@ def scanJob(KibbleBit, source, job, creds): status = 'failed' if 'exception' in data.get('text', []): status = 'aborted' - + DUR = 0 # Calc when the build finished if completed and len(data.get('times', [])) == 2 and data['times'][1]: @@ -87,7 +87,7 @@ def scanJob(KibbleBit, source, job, creds): DUR = FIN - data['times'][0] else: FIN = 0 - + doc = { # Build specific data 'id': buildhash, @@ -100,7 +100,7 @@ def scanJob(KibbleBit, source, job, creds): 'status': status, 'started': int(data['times'][0]), 'ci': 'buildbot', - + # Standard docs values 'sourceID': source['sourceID'], 'organisation': source['organisation'], @@ -109,7 +109,7 @@ def scanJob(KibbleBit, source, job, creds): KibbleBit.append('ci_build', doc) # Yay, it worked! return True - + # Boo, it failed! KibbleBit.pprint("Fetching job data failed!") return False @@ -124,7 +124,7 @@ def __init__(self, block, KibbleBit, source, creds, jobs): self.creds = creds self.source = source self.jobs = jobs - + def run(self): badOnes = 0 while len(self.jobs) > 0 and badOnes <= 50: @@ -158,7 +158,9 @@ def scan(KibbleBit, source): # Simple URL check buildbot = re.match(r"(https?://.+)", source['sourceURL']) if buildbot: - + if not 'steps' in source: + source['steps'] = {} + source['steps']['ci'] = { 'time': time.time(), 'status': 'Parsing Buildbot job changes...', @@ -166,7 +168,7 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - + badOnes = 0 pendingJobs = [] KibbleBit.pprint("Parsing Buildbot activity at %s" % source['sourceURL']) @@ -177,22 +179,22 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - - # Buildbot may neeed credentials + + # Buildbot may need credentials creds = None if source['creds'] and 'username' in source['creds'] and 
source['creds']['username'] and len(source['creds']['username']) > 0: creds = "%s:%s" % (source['creds']['username'], source['creds']['password']) - + # Get the job list sURL = source['sourceURL'] KibbleBit.pprint("Getting job list...") builders = plugins.utils.jsonapi.get("%s/api/v2/builders" % sURL , auth = creds) - + # Save queue snapshot - NOW = int(datetime.datetime.utcnow().timestamp()) + NOW = int(datetime.datetime.now(datetime.timezone.utc).timestamp()) queuehash = hashlib.sha224( ("%s-%s-queue-%s" % (source['organisation'], source['sourceID'], int(time.time())) ).encode('ascii', errors='replace')).hexdigest() - - + + # Scan queue items blocked = 0 stuck = 0 @@ -202,7 +204,7 @@ def scan(KibbleBit, source): actualQueueSize = 0 building = 0 jobs = [] - + for builder, data in builders.items(): jobs.append(builder) if data['state'] == 'building': @@ -217,8 +219,8 @@ def scan(KibbleBit, source): # Stuck builds (iow no builder available) if data['state'] == 'offline': stuck += data.get('pendingBuilds', 0) - - + + # Write up a queue doc queuedoc = { 'id': queuehash, @@ -229,16 +231,16 @@ def scan(KibbleBit, source): 'stuck': stuck, 'building': building, 'ci': 'buildbot', - + # Standard docs values 'sourceID': source['sourceID'], 'organisation': source['organisation'], 'upsert': True, } KibbleBit.append('ci_queue', queuedoc) - + KibbleBit.pprint("Found %u builders in Buildbot" % len(jobs)) - + threads = [] block = threading.Lock() KibbleBit.pprint("Scanning jobs using 4 sub-threads") @@ -246,11 +248,11 @@ def scan(KibbleBit, source): t = buildbotThread(block, KibbleBit, source, creds, jobs) threads.append(t) t.start() - + for t in threads: t.join() - # We're all done, yaay + # We're all done, yaay KibbleBit.pprint("Done scanning %s" % source['sourceURL']) source['steps']['ci'] = { @@ -260,4 +262,3 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - \ No newline at end of file diff --git a/src/plugins/scanners/discourse.py 
b/src/plugins/scanners/discourse.py index d160baa..9ecae03 100644 --- a/src/plugins/scanners/discourse.py +++ b/src/plugins/scanners/discourse.py @@ -30,7 +30,7 @@ """ title = "Scanner for Discourse Forums" -version = "0.1.0" +version = "0.1.1" def accepts(source): """ Determines whether we want to handle this source """ @@ -41,30 +41,30 @@ def accepts(source): def scanJob(KibbleBit, source, cat, creds): """ Scans a single discourse category for activity """ - NOW = int(datetime.datetime.utcnow().timestamp()) - + NOW = int(datetime.datetime.now(datetime.timezone.utc).timestamp()) + # Get $discourseURL/c/$catID - + catURL = os.path.join(source['sourceURL'], "c/%s" % cat['id']) KibbleBit.pprint("Scanning Discourse category '%s' at %s" % (cat['slug'], catURL)) - + page = 0 allUsers = {} - + # For each paginated result (up to page 100), check for changes while page < 100: pcatURL = "%s?page=%u" % (catURL, page) catjson = plugins.utils.jsonapi.get(pcatURL, auth = creds) page += 1 - - + + if catjson: - + # If we hit an empty list (no more topics), just break the loop. if not catjson['topic_list']['topics']: break - - # First (if we have data), we should store the known users + + # First (if we have data), we should store the known users # Since discourse hides the email (obviously!), we'll have to # fake one to generate an account. fakeDomain = "foo.discourse" @@ -75,7 +75,7 @@ def scanJob(KibbleBit, source, cat, creds): # Fake email address, compute deterministic ID email = "%s@%s" % (user['username'], fakeDomain) dhash = hashlib.sha224( ("%s-%s-%s" % (source['organisation'], source['sourceURL'], email) ).encode('ascii', errors='replace')).hexdigest() - + # Construct a very sparse user document userDoc = { 'id': dhash, @@ -83,29 +83,29 @@ def scanJob(KibbleBit, source, cat, creds): 'name': user['username'], 'email': email, } - + # Store user-ID-to-username mapping for later allUsers[user['id']] = userDoc - + # Store it (or, queue storage) unless it exists. 
# We don't wanna override better data, so we check if # it's there first. if not KibbleBit.exists('person', dhash): KibbleBit.append('person', userDoc) - + # Now, for each topic, we'll store a topic document for topic in catjson['topic_list']['topics']: - + # Calculate topic ID dhash = hashlib.sha224( ("%s-%s-topic-%s" % (source['organisation'], source['sourceURL'], topic['id']) ).encode('ascii', errors='replace')).hexdigest() - + # Figure out when topic was created and updated CreatedDate = datetime.datetime.strptime(topic['created_at'], "%Y-%m-%dT%H:%M:%S.%fZ").timestamp() if topic.get('last_posted_at'): UpdatedDate = datetime.datetime.strptime(topic['last_posted_at'], "%Y-%m-%dT%H:%M:%S.%fZ").timestamp() else: UpdatedDate = 0 - + # Determine whether we should scan this topic or continue to the next one. # We'll do this by seeing if the topic already exists and has no changes or not. if KibbleBit.exists('forum_topic', dhash): @@ -113,14 +113,14 @@ def scanJob(KibbleBit, source, cat, creds): # If update in the old doc was >= current update timestamp, skip the topic if fdoc['updated'] >= UpdatedDate: continue - - + + # Assuming we need to scan this, start by making the base topic document topicdoc = { 'id': dhash, 'sourceID': source['sourceID'], 'organisation': source['organisation'], - + 'type': 'discourse', 'category': cat['slug'], 'title': topic['title'], @@ -134,23 +134,23 @@ def scanJob(KibbleBit, source, cat, creds): 'views': topic['views'], 'url': source['sourceURL'] + "/t/%s/%s" % (topic['slug'], topic['id']) } - + KibbleBit.append('forum_topic', topicdoc) KibbleBit.pprint("%s is new or changed, scanning" % topicdoc['url']) - + # Now grab all the individual replies/posts # Remember to not have it count as a visit! 
pURL = "%s?track_visit=false&forceLoad=true" % topicdoc['url'] pjson = plugins.utils.jsonapi.get(pURL, auth = creds) - + posts = pjson['post_stream']['posts'] - + # For each post/reply, construct a forum_entry document KibbleBit.pprint("%s has %u posts" % (pURL, len(posts))) for post in posts: phash = hashlib.sha224( ("%s-%s-post-%s" % (source['organisation'], source['sourceURL'], post['id']) ).encode('ascii', errors='replace')).hexdigest() uname = post.get('name', post['username']) or post['username'] # Hack to get longest non-zero value - + # Find the hash of the person who posted it # We may know them, or we may have to store them. # If we have better info now (full name), re-store @@ -160,7 +160,7 @@ def scanJob(KibbleBit, source, cat, creds): # Same as before, fake email, store... email = "%s@%s" % (post['username'], fakeDomain) uhash = hashlib.sha224( ("%s-%s-%s" % (source['organisation'], source['sourceURL'], email) ).encode('ascii', errors='replace')).hexdigest() - + # Construct a very sparse user document userDoc = { 'id': uhash, @@ -168,22 +168,22 @@ def scanJob(KibbleBit, source, cat, creds): 'name': uname, 'email': email, } - + # Store user-ID-to-username mapping for later allUsers[user['id']] = userDoc - + # Store it (or, queue storage) KibbleBit.append('person', userDoc) - + # Get post date CreatedDate = datetime.datetime.strptime(post['created_at'], "%Y-%m-%dT%H:%M:%S.%fZ").timestamp() - + # Store the post/reply document pdoc = { 'id': phash, 'sourceID': source['sourceID'], 'organisation': source['organisation'], - + 'type': 'discourse', 'creator': uhash, 'created': CreatedDate, @@ -209,7 +209,7 @@ def __init__(self, block, KibbleBit, source, creds, jobs): self.creds = creds self.source = source self.jobs = jobs - + def run(self): badOnes = 0 while len(self.jobs) > 0 and badOnes <= 50: @@ -243,7 +243,9 @@ def scan(KibbleBit, source): # Simple URL check discourse = re.match(r"(https?://.+)", source['sourceURL']) if discourse: - + if not 'steps' in 
source: + source['steps'] = {} + source['steps']['forum'] = { 'time': time.time(), 'status': 'Parsing Discourse topics...', @@ -251,7 +253,7 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - + badOnes = 0 pendingJobs = [] KibbleBit.pprint("Parsing Discourse activity at %s" % source['sourceURL']) @@ -262,22 +264,22 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - + # Discourse may neeed credentials (if basic auth) creds = None if source['creds'] and 'username' in source['creds'] and source['creds']['username'] and len(source['creds']['username']) > 0: creds = "%s:%s" % (source['creds']['username'], source['creds']['password']) - + # Get the list of categories sURL = source['sourceURL'] KibbleBit.pprint("Getting categories...") catjs = plugins.utils.jsonapi.get("%s/categories_and_latest" % sURL , auth = creds) - + # Directly assign the category list as pending jobs queue, ezpz. pendingJobs = catjs['category_list']['categories'] - + KibbleBit.pprint("Found %u categories" % len(pendingJobs)) - + # Now fire off 4 threads to parse the categories threads = [] block = threading.Lock() @@ -286,11 +288,11 @@ def scan(KibbleBit, source): t = discourseThread(block, KibbleBit, source, creds, pendingJobs) threads.append(t) t.start() - + for t in threads: t.join() - # We're all done, yaay + # We're all done, yaay KibbleBit.pprint("Done scanning %s" % source['sourceURL']) source['steps']['forum'] = { @@ -300,4 +302,3 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - \ No newline at end of file diff --git a/src/plugins/scanners/gerrit.py b/src/plugins/scanners/gerrit.py index 86dd327..70148a1 100644 --- a/src/plugins/scanners/gerrit.py +++ b/src/plugins/scanners/gerrit.py @@ -154,6 +154,9 @@ def status_changed(stored_change, change): return stored_change['status'] != change['status'] def scan(KibbleBit, source): + if not 'steps' in source: + source['steps'] = {} + source['steps']['issues'] = 
{ 'time': time.time(), 'status': 'Analyzing Gerrit tickets...', @@ -226,7 +229,7 @@ def scan(KibbleBit, source): except requests.HTTPError as e: print(e) - + source['steps']['issues'] = { 'time': time.time(), 'status': 'Done analyzing tickets!', diff --git a/src/plugins/scanners/git-census.py b/src/plugins/scanners/git-census.py index f5cc69d..b0327fc 100644 --- a/src/plugins/scanners/git-census.py +++ b/src/plugins/scanners/git-census.py @@ -50,8 +50,11 @@ def scan(KibbleBit, source): url = source['sourceURL'] rootpath = "%s/%s/git" % (KibbleBit.config['scanner']['scratchdir'], source['organisation']) gpath = os.path.join(rootpath, rid) - - if 'steps' in source and source['steps']['sync']['good'] and os.path.exists(gpath): + + if not 'steps' in source: + source['steps'] = {} + + if source['steps']['sync']['good'] and os.path.exists(gpath): source['steps']['census'] = { 'time': time.time(), 'status': 'Census count started at ' + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()), @@ -177,7 +180,7 @@ def scan(KibbleBit, source): # Make a list of changed files, max 1024 filelist = list(files_touched) filelist = filelist[:1023] - + # ES commit documents tsd = ts - (ts % 86400) js = { @@ -222,7 +225,7 @@ def scan(KibbleBit, source): 'organisation': source['organisation'], 'id' : hashlib.sha1( ("%s%s" % (source['organisation'], ce)).encode('ascii', errors='replace')).hexdigest() }) - KibbleBit.append ( 'person', + KibbleBit.append ( 'person', { 'upsert': True, 'name': an, @@ -234,7 +237,7 @@ def scan(KibbleBit, source): ) KibbleBit.append('code_commit', js) KibbleBit.append('code_commit_unique', jsx) - + if True: # Do file changes?? 
Might wanna make this optional KibbleBit.pprint("Scanning file changes for %s" % source['sourceURL']) for filename in modificationDates: @@ -257,7 +260,7 @@ def scan(KibbleBit, source): del jsfe['created'] del jsfe['createdDate'] KibbleBit.append('file_history', jsfe) - + source['steps']['census'] = { 'time': time.time(), 'status': 'Census count completed at ' + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()), @@ -266,5 +269,3 @@ def scan(KibbleBit, source): } source['census'] = time.time() KibbleBit.updateSource(source) - - diff --git a/src/plugins/scanners/git-evolution.py b/src/plugins/scanners/git-evolution.py index 8ed648c..b9b6ca5 100644 --- a/src/plugins/scanners/git-evolution.py +++ b/src/plugins/scanners/git-evolution.py @@ -16,14 +16,13 @@ # limitations under the License. """ Git Evolution scanner """ +import importlib import os import subprocess -import re import time import calendar import datetime -import plugins.utils.git -import plugins.utils.sloc + import hashlib from collections import namedtuple @@ -68,7 +67,7 @@ def release(KibbleBit, source, status, exception=None, good=False): if exception: source['steps']['evolution'].update({'exception': exception}) KibbleBit.updateSource(source) - + def check_branch(gpath, date, branch): try: @@ -114,48 +113,48 @@ def find_branch(date, gpath): def scan(KibbleBit, source): - + rid = source['sourceID'] url = source['sourceURL'] rootpath = "%s/%s/git" % (KibbleBit.config['scanner']['scratchdir'], source['organisation']) gpath = os.path.join(rootpath, rid) - + gname = source['sourceID'] KibbleBit.pprint("Doing evolution scan of %s" % gname) - + inp = get_first_ref(gpath) if inp: ts = int(inp.split()[0]) ts = ts - (ts % 86400) date = time.strftime("%Y-%b-%d 0:00", time.gmtime(ts)) - + #print("Starting from %s" % date) now = time.time() - + rid = source['sourceID'] url = source['sourceURL'] rootpath = "%s/%s/git" % (KibbleBit.config['scanner']['scratchdir'], source['organisation']) gpath = 
os.path.join(rootpath, rid) - + if source['steps']['sync']['good'] and os.path.exists(gpath): acquire(KibbleBit, source) branch = find_branch(date, gpath) - + if not branch: release(source, "Could not do evolutionary scan of code", "No default branch was found in this repository") return - + branch_exists = check_branch(gpath, date, branch) - + if not branch_exists: KibbleBit.pprint("Not trunk either (bad repo?), skipping") release(source, "Could not do evolutionary scan of code", "No default branch was found in this repository") return - + try: - + d = time.gmtime(now) year = d[0] quarter = d[1] - (d[1] % 3) @@ -166,7 +165,7 @@ def scan(KibbleBit, source): pd = datetime.datetime(year, quarter, 1).replace(tzinfo=datetime.timezone.utc).timetuple() date = time.strftime("%Y-%b-%d 0:00", pd) unix = calendar.timegm(pd) - + # Skip the dates we've already processed dhash = hashlib.sha224((source['sourceID'] + date).encode('ascii', 'replace')).hexdigest() @@ -174,7 +173,8 @@ def scan(KibbleBit, source): if not found: checkout(gpath, date, branch) KibbleBit.pprint("Running cloc on %s (%s) at %s" % (gname, source['sourceURL'], date)) - languages, codecount, comment, blank, years, cost = plugins.utils.sloc.count(gpath) + sloc = importlib.import_module("plugins.utils.sloc") + languages, codecount, comment, blank, years, cost = sloc.count(gpath) js = { 'time': unix, 'sourceID': source['sourceID'], @@ -192,7 +192,7 @@ def scan(KibbleBit, source): if quarter <= 0: quarter += 12 year -= 1 - + # decrease month by 3 now = time.mktime(datetime.date(year, quarter, 1).timetuple()) except Exception as e: @@ -201,9 +201,7 @@ def scan(KibbleBit, source): time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()), str(e)) return - + release(KibbleBit, source, "Evolution scan completed at " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()), good=True) - - \ No newline at end of file diff --git a/src/plugins/scanners/git-sloc.py b/src/plugins/scanners/git-sloc.py index 
e2294f7..0b202ce 100644 --- a/src/plugins/scanners/git-sloc.py +++ b/src/plugins/scanners/git-sloc.py @@ -16,19 +16,17 @@ # limitations under the License. """ Source Lines of Code counter for Git """ - - +import importlib import os import sys import subprocess import time import shutil -import plugins.utils.git -import plugins.utils.sloc + import re title = "SloC Counter for Git" -version = "0.1.0" +version = "0.1.1" def accepts(source): """ Do we accept this source? """ @@ -40,12 +38,15 @@ def accepts(source): return False def scan(KibbleBit, source): - + rid = source['sourceID'] url = source['sourceURL'] rootpath = "%s/%s/git" % (KibbleBit.config['scanner']['scratchdir'], source['organisation']) gpath = os.path.join(rootpath, rid) - + + if not 'steps' in source: + source['steps'] = {} + if source['steps']['sync']['good'] and os.path.exists(gpath): source['steps']['count'] = { 'time': time.time(), @@ -54,17 +55,19 @@ def scan(KibbleBit, source): 'good': True, } KibbleBit.updateSource(source) - + + git = importlib.import_module("plugins.utils.git") try: - branch = plugins.utils.git.defaultBranch(source, gpath) + branch = git.defaultBranch(source, gpath) subprocess.call('cd %s && git checkout %s' % (gpath, branch), shell = True) except: KibbleBit.pprint("SLoC counter failed to find main branch for %s!!" % url) return False - + KibbleBit.pprint("Running SLoC count for %s" % url) - languages, codecount, comment, blank, years, cost = plugins.utils.sloc.count(gpath) - + sloc = importlib.import_module("plugins.utils.sloc") + languages, codecount, comment, blank, years, cost = sloc.count(gpath) + sloc = { 'sourceID': source['sourceID'], 'loc': codecount, diff --git a/src/plugins/scanners/git-sync.py b/src/plugins/scanners/git-sync.py index 234e33b..64bec50 100644 --- a/src/plugins/scanners/git-sync.py +++ b/src/plugins/scanners/git-sync.py @@ -15,16 +15,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- import os import sys import subprocess import time import shutil -import plugins.utils.git + +import plugins.utils.git as git title = "Sync plugin for Git repositories" -version = "0.1.2" +version = "0.1.3" def accepts(source): """ Do we accept this source? """ @@ -34,15 +34,15 @@ def accepts(source): if source['type'] == 'github' and source.get('issuesonly', False) == False: return True return False - + def scan(KibbleBit, source): - + #KibbleBit.pprint("Scan source: %s." % source) # Get some vars, construct a data path for the repo path = source['sourceID'] url = source['sourceURL'] rootpath = "%s/%s/git" % (KibbleBit.config['scanner']['scratchdir'], source['organisation']) - + # If the root path does not exist, try to make it recursively. if not os.path.exists(rootpath): try: @@ -57,14 +57,14 @@ def scan(KibbleBit, source): } KibbleBit.updateSource(source) return - + # This is were the repo should be cloned datapath = os.path.join(rootpath, path) - + KibbleBit.pprint("Checking out %s as %s" % (url, path)) try: - if 'steps' not in source: # initial fetch of a github repo may miss steps + if 'steps' not in source: # initial fetch of a github repo may miss steps source['steps'] = {} source['steps']['sync'] = { 'time': time.time(), @@ -73,13 +73,13 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - + # If we already checked this out earlier, just sync it. if os.path.exists(datapath): KibbleBit.pprint("Repo %s exists, fetching changes..." % datapath) - + # Do we have a default branch here? 
- branch = plugins.utils.git.defaultBranch(source, datapath, KibbleBit) + branch = git.defaultBranch(source, datapath, KibbleBit) if len(branch) == 0: source['default_branch'] = branch source['steps']['sync'] = { @@ -113,7 +113,7 @@ def scan(KibbleBit, source): fcommit = fcommit.decode('ascii').strip() subprocess.check_call("cd %s && git reset --hard %s" % (datapath, fcommit), shell = True, stderr=subprocess.STDOUT) try: - subprocess.check_call("cd %s && git clean -xfd" % datpath, shell = True, stderr=subprocess.STDOUT) + subprocess.check_call("cd %s && git clean -xfd" % datapath, shell = True, stderr=subprocess.STDOUT) except: pass # This is a new repo, clone it! @@ -133,7 +133,7 @@ def scan(KibbleBit, source): } KibbleBit.updateSource(source) return - + # All good, yay! source['steps']['sync'] = { 'time': time.time(), @@ -142,4 +142,3 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - diff --git a/src/plugins/scanners/github-issues.py b/src/plugins/scanners/github-issues.py index bea1180..f6737c4 100644 --- a/src/plugins/scanners/github-issues.py +++ b/src/plugins/scanners/github-issues.py @@ -14,13 +14,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +import importlib import re import hashlib from dateutil import parser import time import requests -import plugins.utils.github title = "Scanner for GitHub Issues" version = "0.1.0" @@ -121,12 +120,14 @@ def update_person(KibbleBit, person): def scan(KibbleBit, source, firstAttempt = True): auth=None people = {} + github = importlib.import_module("plugins.utils.github") + if 'creds' in source: KibbleBit.pprint("Using auth for repo %s" % source['sourceURL']) creds = source['creds'] if creds and 'username' in creds: auth = (creds['username'], creds['password']) - TL = plugins.utils.github.get_tokens_left(auth=auth) + TL = github.get_tokens_left(auth=auth) KibbleBit.pprint("Scanning for GitHub issues (%u tokens left on GitHub)" % TL) # Have we scanned before? If so, only do a 3 month scan here. doneBefore = False @@ -143,11 +144,11 @@ def scan(KibbleBit, source, firstAttempt = True): if doneBefore: since = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(time.time() - (3*30*86400))) KibbleBit.pprint("Fetching changes since %s" % since) - issues = plugins.utils.github.get_all(source, plugins.utils.github.issues, + issues = github.get_all(source, github.issues, params={'filter': 'all', 'state':'all', 'since': since}, auth=auth) else: - issues = plugins.utils.github.get_all(source, plugins.utils.github.issues, + issues = github.get_all(source, github.issues, params={'filter': 'all', 'state':'all'}, auth=auth) KibbleBit.pprint("Fetched %s issues for %s" %(str(len(issues)), source['sourceURL'])) @@ -155,14 +156,14 @@ def scan(KibbleBit, source, firstAttempt = True): for issue in issues: if not issue['user']['login'] in people: - person = make_person(source, issue, plugins.utils.github.user(issue['user']['url'], + person = make_person(source, issue, github.user(issue['user']['url'], auth=auth)) people[issue['user']['login']] = person update_person(KibbleBit, person) #KibbleBit.pprint("issue: %s" % issue ) if 'closed_by' in issue and issue['closed_by'] is not None and not 
issue['closed_by']['login'] in people: - closer = make_person(source, issue, plugins.utils.github.user(issue['closed_by']['url'], + closer = make_person(source, issue, github.user(issue['closed_by']['url'], auth=auth)) people[issue['closed_by']['login']] = closer update_person(KibbleBit, closer) @@ -172,7 +173,10 @@ def scan(KibbleBit, source, firstAttempt = True): stored_change = None if KibbleBit.exists('issue', dhash): es_doc = KibbleBit.get('issue', dhash) - if not status_changed(es_doc, doc): + if 'doc' in es_doc: + es_doc = es_doc['doc'] + #KibbleBit.pprint("status %s seen %s." % ('status' in es_doc, 'status' in doc)) + if 'status' in es_doc and 'status' in doc and not status_changed(es_doc, doc): #KibbleBit.pprint("change %s seen already and status unchanged. Skipping." % issue['id']) continue @@ -190,16 +194,16 @@ def scan(KibbleBit, source, firstAttempt = True): # If we errored out because of rate limiting, retry later, otherwise bail if firstAttempt: sleeps = 0 - if plugins.utils.github.get_tokens_left(auth=auth) < 10: + if github.get_tokens_left(auth=auth) < 10: KibbleBit.pprint("Hit rate limits, trying to sleep it off!") - while plugins.utils.github.get_tokens_left(auth=auth) < 10: + while github.get_tokens_left(auth=auth) < 10: sleeps += 1 if sleeps > 24: KibbleBit.pprint("Slept for too long without finding a reset rate limit, giving up!") break time.sleep(300) # Sleep 5 min, then check again.. # If we have tokens, try one more time... - if plugins.utils.github.get_tokens_left(auth=auth) > 10: + if github.get_tokens_left(auth=auth) > 10: scan(KibbleBit, source, False) # If this one fails, bail completely return diff --git a/src/plugins/scanners/github-stats.py b/src/plugins/scanners/github-stats.py index 4ac933c..3c8d9f4 100644 --- a/src/plugins/scanners/github-stats.py +++ b/src/plugins/scanners/github-stats.py @@ -14,14 +14,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. - - +import hashlib +import importlib import os +import re import sys import subprocess import time import shutil -import plugins.utils.git + +from src.plugins.brokers.kibbleES import KibbleBit title = "Traffic statistics plugin for GitHub repositories" version = "0.1.0" @@ -31,17 +33,17 @@ def accepts(source): if source['type'] == 'github': return True return False - + def getTime(string): """ Convert GitHub timestamp to epoch """ return time.mktime(time.strptime(re.sub(r"Z", "", str(string)), "%Y-%m-%dT%H:%M:%S")) def scan(KibbletBit, source): - + # Get some vars, construct a data path for the repo path = source['sourceID'] url = source['sourceURL'] - + auth=None people = {} if 'creds' in source: @@ -60,12 +62,14 @@ def scan(KibbletBit, source): 'good': True } KibbletBit.updateSource(source) - + # Get views - views = plugins.utils.github.views(url, auth) + github = importlib.import_module("plugins.utils.github") + views = github.views(url, auth) if 'views' in views: for el in views['views']: ts = getTime(el['timestamp']) + #print("reformatted time:", ts) shash = hashlib.sha224( ("%s-%s-%s-clones" %(source['organisation'], url, el['timestamp'])).encode('ascii', errors = 'replace')).hexdigest() bit = { 'organisation': source['organisation'], @@ -78,9 +82,9 @@ def scan(KibbletBit, source): 'id': shash } KibbleBit.append('ghstats', bit) - + # Get clones - clones = plugins.utils.github.clones(url, auth) + clones = github.clones(url, auth) if 'clones' in clones: for el in clones['clones']: ts = getTime(el['timestamp']) @@ -96,12 +100,12 @@ def scan(KibbletBit, source): 'id': shash } KibbleBit.append('ghstats', bit) - + # Get referrers - refs = plugins.utils.github.referrers(url, auth) + refs = github.referrers(url, auth) if refs: for el in refs: - el['timestamp'] = time.strftime("%Y-%m-%dT%H:%M:%S", time.time()) + el['timestamp'] = time.strftime("%Y-%m-%dT%H:%M:%S", 
time) ts = getTime(el['timestamp']) shash = hashlib.sha224( ("%s-%s-%s-refs" %(source['organisation'], url, el['timestamp'])).encode('ascii', errors = 'replace')).hexdigest() bit = { @@ -118,4 +122,3 @@ def scan(KibbletBit, source): except: pass # All done! - \ No newline at end of file diff --git a/src/plugins/scanners/jira.py b/src/plugins/scanners/jira.py index 58a8daa..000c05b 100644 --- a/src/plugins/scanners/jira.py +++ b/src/plugins/scanners/jira.py @@ -14,13 +14,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import importlib import time import datetime import re import json import hashlib -import plugins.utils.jsonapi import threading import requests.exceptions @@ -36,7 +35,7 @@ def accepts(source): if source['type'] == 'jira': return True if source['type'] == "issuetracker": - jira = re.match(r"(https?://.+)/browse/([A-Z0-9]+)", url) + jira = re.match(r"(https?://.+)/browse/([A-Z0-9]+)", source) if jira: return True return False @@ -106,12 +105,12 @@ def pchange(js): def scanTicket(KibbleBit, key, u, source, creds, openTickets): """ Scans a single ticket for activity and people """ - + dhash = hashlib.sha224( ("%s-%s-%s" % (source['organisation'], source['sourceURL'], key) ).encode('ascii', errors='replace')).hexdigest() found = True doc= None parseIt = False - + # the 'domain' var we try to figure out here is used # for faking email addresses and keep them unique, # in case JIRA has email visibility turned off. @@ -119,7 +118,7 @@ def scanTicket(KibbleBit, key, u, source, creds, openTickets): m = re.search(r"https?://([^/]+)", u) if m: domain = m.group(1) - + found = KibbleBit.exists('issue', dhash) if not found: KibbleBit.pprint("[%s] We've never seen this ticket before, parsing..." 
% key) @@ -139,13 +138,14 @@ def scanTicket(KibbleBit, key, u, source, creds, openTickets): KibbleBit.pprint("[%s] Ticket contains erroneous data from a previous scan, reparsing" % key) # This is just noise! #KibbleBit.pprint("[%s] Ticket hasn't changed, ignoring..." % key) - + if parseIt: KibbleBit.pprint("[%s] Parsing data from JIRA at %s..." % (key, domain)) queryURL = "%s/rest/api/2/issue/%s?fields=creator,reporter,status,issuetype,summary,assignee,resolutiondate,created,priority,changelog,comment,resolution,votes&expand=changelog" % (u, key) jiraURL = "%s/browse/%s" % (u, key) + jsonapi = importlib.import_module("plugins.utils.jsonapi") try: - tjson = plugins.utils.jsonapi.get(queryURL, auth = creds) + tjson = jsonapi.get(queryURL, auth = creds) if not tjson: KibbleBit.pprint("%s does not exist (404'ed)" % key) return False @@ -157,12 +157,12 @@ def scanTicket(KibbleBit, key, u, source, creds, openTickets): KibbleBit.pprint("Closed but no closer??") closerEmail = None status = 'closed' if st else 'open' - + # Make sure we actually have field data to work with if not tjson.get('fields') or not tjson['fields'].get('created'): KibbleBit.pprint("[%s] JIRA response is missing field data, ignoring ticket." 
% key) return False - + cd = getTime(tjson['fields']['created']) rd = getTime(tjson['fields']['resolutiondate']) if 'resolutiondate' in tjson['fields'] and tjson['fields']['resolutiondate'] else None comments = 0 @@ -190,7 +190,7 @@ def scanTicket(KibbleBit, key, u, source, creds, openTickets): 'upsert': True } KibbleBit.append('person', jsp) - + if creator: creator = creator.replace(" dot ", ".", 10).replace(" at ", "@", 1) if not '@' in creator: @@ -219,7 +219,7 @@ def scanTicket(KibbleBit, key, u, source, creds, openTickets): 'created': cd, 'closed': rd, 'issuetype': 'issue', - 'issueCloser': closerEmail, + 'issueCloser': closerEmail, 'createdDate': time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(cd)), 'closedDate': time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(rd)) if rd else None, 'changeDate': time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(rd if rd else cd)), @@ -234,8 +234,8 @@ def scanTicket(KibbleBit, key, u, source, creds, openTickets): #except Exception as err: #KibbleBit.pprint(err) #return False - - + + class jiraThread(threading.Thread): @@ -247,7 +247,7 @@ def __init__(self, block, KibbleBit, source, creds, pt, ot): self.source = source self.pendingTickets = pt self.openTickets = ot - + def run(self): badOnes = 0 while len(self.pendingTickets) > 0 and badOnes <= 50: @@ -281,13 +281,17 @@ def run(self): def scan(KibbleBit, source): jira = re.match(r"(https?://.+)/browse/([A-Z0-9]+)", source['sourceURL']) if jira: - + + if not 'steps' in source: + source['steps'] = {} + #print("issue source %s" % source ) # JIRA NEEDS credentials to do a proper scan! 
creds = None - if source['creds'] and 'username' in source['creds'] and source['creds']['username'] and len(source['creds']['username']) > 0: + if 'creds' in source and source['creds'] and 'username' in source['creds'] and source['creds']['username'] and len(source['creds']['username']) > 0: creds = "%s:%s" % (source['creds']['username'], source['creds']['password']) if not creds: KibbleBit.pprint("JIRA at %s requires authentication, but none was found! Bailing." % source['sourceURL']) + source['steps']['issues'] = { 'time': time.time(), 'status': 'JIRA endpoint requires auth, but none was provided!', @@ -296,7 +300,7 @@ def scan(KibbleBit, source): } KibbleBit.updateSource(source) return - + source['steps']['issues'] = { 'time': time.time(), 'status': 'Parsing JIRA changes...', @@ -304,7 +308,7 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - + badOnes = 0 jsa = [] jsp = [] @@ -317,16 +321,17 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - + # Get base URL, list and domain to parse u = jira.group(1) instance = jira.group(2) lastTicket = 0 latestURL = "%s/rest/api/2/search?jql=project=%s+order+by+createdDate+DESC&fields=id,key&maxResults=1" % (u, instance) js = None - + + jsonapi = importlib.import_module("plugins.utils.jsonapi") try: - js = plugins.utils.jsonapi.get(latestURL, auth = creds) + js = jsonapi.get(latestURL, auth = creds) except requests.exceptions.ConnectionError as err: KibbleBit.pprint("Connection error, skipping this ticket for now!") source['steps']['issues'] = { @@ -342,8 +347,8 @@ def scan(KibbleBit, source): m = re.search(r"-(\d+)$", key) if m: lastTicket = int(m.group(1)) - - + + openTickets = [] startAt = 0 badTries = 0 @@ -351,7 +356,7 @@ def scan(KibbleBit, source): openURL = "%s/rest/api/2/search?jql=project=%s+and+status=open+order+by+createdDate+ASC&fields=id,key&maxResults=100&startAt=%u" % (u, instance, startAt) #print(openURL) try: - ojs = 
plugins.utils.jsonapi.get(openURL, auth = creds) + ojs = jsonapi.get(openURL, auth = creds) if not 'issues' in ojs or len(ojs['issues']) == 0: break for item in ojs['issues']: @@ -362,12 +367,12 @@ def scan(KibbleBit, source): KibbleBit.pprint("JIRA borked, retrying") badTries += 1 KibbleBit.pprint("Found %u open tickets" % len(openTickets)) - + badOnes = 0 for i in reversed(range(1,lastTicket+1)): key = "%s-%u" % (instance, i) pendingTickets.append([key, u, source]) - + threads = [] block = threading.Lock() KibbleBit.pprint("Scanning tickets using 4 sub-threads") @@ -375,10 +380,10 @@ def scan(KibbleBit, source): t = jiraThread(block, KibbleBit, source, creds, pendingTickets, openTickets) threads.append(t) t.start() - + for t in threads: t.join() - + KibbleBit.pprint("Done scanning %s" % source['sourceURL']) source['steps']['issues'] = { @@ -388,4 +393,3 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - diff --git a/src/plugins/scanners/ponymail.py b/src/plugins/scanners/ponymail.py index abe8fc1..fd9bca2 100644 --- a/src/plugins/scanners/ponymail.py +++ b/src/plugins/scanners/ponymail.py @@ -35,12 +35,12 @@ def accepts(source): # If the source equals the plugin name, assume a yes if source['type'] == 'ponymail': return True - + # If it's of type 'mail', check the URL if source['type'] == 'mail': if re.match(r"(https?://.+)/list\.html\?(.+)@(.+)", source['sourceURL']): return True - + # Default to not recognizing the source return False @@ -86,9 +86,12 @@ def scan(KibbleBit, source): } KibbleBit.updateSource(source) return - + # Pony Mail requires a UI cookie in order to work. Maked sure we have one! 
cookie = None + + if not 'steps' in source: + source['steps'] = {} if 'creds' in source and source['creds']: cookie = source['creds'].get('cookie', None) if not cookie: @@ -101,7 +104,7 @@ def scan(KibbleBit, source): } KibbleBit.updateSource(source) return - + # Notify scanner and DB that this is valid and we've begun parsing KibbleBit.pprint("%s is a valid Pony Mail address, parsing" % source['sourceURL']) source['steps']['mail'] = { @@ -111,13 +114,13 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - - + + # Get base URL, list and domain to parse u = url.group(1) l = url.group(2) d = url.group(3) - + # Get this month dt = time.gmtime(time.time()) firstYear = 1970 @@ -127,15 +130,15 @@ def scan(KibbleBit, source): month += 12 year -= 1 months = 0 - + # Hash for keeping records of who we know knowns = {} - + # While we have older archives, continue to parse while firstYear <= year: statsurl = "%s/api/stats.lua?list=%s&domain=%s&d=%s" % (u, l, d, "%04u-%02u" % (year, month)) dhash = hashlib.sha224((("%s %s") % (source['organisation'], statsurl)).encode('ascii', errors='replace')).hexdigest() - found = False + found = False if KibbleBit.exists('mailstats', dhash): found = True if months <= 1 or not found: # Always parse this month's stats :) @@ -147,7 +150,7 @@ def scan(KibbleBit, source): js = plugins.utils.jsonapi.get(statsurl, cookie = cookie) except Exception as err: KibbleBit.pprint("Server error, skipping this month") - month -= 1 + month -= 1 if month <= 0: month += 12 year -= 1 @@ -196,7 +199,7 @@ def scan(KibbleBit, source): 'id': mlhash } KibbleBit.index('mailtop', mlhash, jst) - + for email in js['emails']: sender = email['from'] name = sender @@ -214,7 +217,7 @@ def scan(KibbleBit, source): if KibbleBit.exists('person',sid): knowns[sender] = True if not sender in knowns or name != sender: - KibbleBit.append('person', + KibbleBit.append('person', { 'upsert': True, 'name': name, @@ -246,8 +249,8 @@ def scan(KibbleBit, source): 
KibbleBit.append('email', jse) for sender in posters: no_posters += 1 - - + + jso = { 'organisation': source['organisation'], 'sourceURL': source['sourceURL'], @@ -259,16 +262,16 @@ def scan(KibbleBit, source): } #print("Indexing as %s" % dhash) KibbleBit.index('mailstats', dhash, jso) - month -= 1 + month -= 1 if month <= 0: month += 12 year -= 1 - - + + source['steps']['mail'] = { 'time': time.time(), 'status': 'Mail archives successfully scanned at ' + time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(time.time())), 'running': False, 'good': True } - KibbleBit.updateSource(source) \ No newline at end of file + KibbleBit.updateSource(source) diff --git a/src/plugins/utils/git.py b/src/plugins/utils/git.py index 8576ce5..3bea67a 100644 --- a/src/plugins/utils/git.py +++ b/src/plugins/utils/git.py @@ -17,8 +17,6 @@ """ This is the Kibble git utility plugin """ -import os -import sys import subprocess import re @@ -30,7 +28,7 @@ def defaultBranch(source, datapath, KibbleBit = None): if KibbleBit and KibbleBit.config.get('git'): wanted_branches = KibbleBit.config['git'].get('wanted_branches', wanted_branches) foundBranch = False - + # For each wanted branch, in order, look for it in our clone, # and return the name if found. for B in wanted_branches: diff --git a/src/plugins/utils/urlmisc.py b/src/plugins/utils/urlmisc.py index 9e75a4b..3340bda 100644 --- a/src/plugins/utils/urlmisc.py +++ b/src/plugins/utils/urlmisc.py @@ -18,6 +18,7 @@ """ This is a Kibble miscellaneous URL functions plugin. 
""" +import base64 import urllib.request import gzip import tempfile @@ -53,9 +54,8 @@ def unzip(url, creds = None, cookie = None): if err.code != 404 and err.code != 401: tmpfile = tempfile.NamedTemporaryFile(mode='w+b', buffering=1, delete=False) subprocess.check_call(("/usr/bin/wget", "-O", tmpfile.name, url)) - + try: - te compressedFile = open("/tmp/kibbletmp.gz", 'rb') if (compressedFile.read(2) == '\x1f\x8b'): compressedFile.seek(0) @@ -72,4 +72,4 @@ def unzip(url, creds = None, cookie = None): tmpfile.flush() tmpfile.close() return tmpfile.name - return None \ No newline at end of file + return None From c043d6784a994d7e3233ca22b5006cbb041efa33 Mon Sep 17 00:00:00 2001 From: Georg Kallidis Date: Mon, 2 Dec 2024 15:28:48 +0100 Subject: [PATCH 8/9] kibble-scanner should be run in modules (src) folder according to python practice, add some (commented) print statements for debugging, increase version from 0.10. to 0.2.0, fix deprecated utcnow to datetime.timezone.utc, add filter feature for jenkins to allow to check a single job in kibble-scanner.py; --- README.md | 2 +- src/kibble-scanner.py | 45 ++++++----- src/plugins/scanners/jenkins.py | 139 ++++++++++++++++++++------------ src/plugins/scanners/travis.py | 77 +++++++++--------- src/plugins/utils/jsonapi.py | 2 +- 5 files changed, 155 insertions(+), 110 deletions(-) diff --git a/README.md b/README.md index 8e85410..21cea0c 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ The Kibble Scanners collect information for the Kibble Suite. ## How to run: - - On a daily/weekly/whatever basis, run: `python3 src/kibble-scanner.py`. + - On a daily/weekly/whatever basis, run in folder src: `python3 kibble-scanner.py`. 
### Command line options: diff --git a/src/kibble-scanner.py b/src/kibble-scanner.py index ac43477..7b441bd 100644 --- a/src/kibble-scanner.py +++ b/src/kibble-scanner.py @@ -28,8 +28,8 @@ import plugins.brokers.kibbleES #import plugins.kibbleJSON -VERSION = "0.1.0" -CONFIG_FILE = "conf/config.yaml" +VERSION = "0.2.0" +CONFIG_FILE = "../conf/config.yaml" PENDING_OBJECTS = [] BIG_LOCK = threading.Lock() @@ -43,15 +43,16 @@ def base_parser(): arg_parser.add_argument("-t", "--type", help="Specific type of scanner to run (default is run all scanners)") arg_parser.add_argument("-e", "--exclude", nargs = '+', help="Specific type of scanner(s) to exclude") arg_parser.add_argument("-v", "--view", help="Specific source view to scan (default is scan all sources)") + arg_parser.add_argument("-j", "--filter", nargs='+', help="Jenkins-only: Filter the list of jobs (e.g. for debugging). To drill down to the target jobs, all nodes to the leaf node(s) are required, e.g --filter . Type is set to jenkins implicitely.") return arg_parser - + def pprint(string, err = False): line = "[core]: %s" % (string) if err: sys.stderr.write(line + "\n") else: print(line) - + def isMine(ID, config): if config['scanner'].get('balance', None): @@ -65,11 +66,11 @@ def isMine(ID, config): return True return False return True - + class scanThread(threading.Thread): """ A thread object that grabs an item from the queue and processes it, using whatever plugins will come out to play. 
""" - def __init__(self, broker, org, i, t = None, e = None): + def __init__(self, broker, org, i, t = None, e = None, f= None): super(scanThread, self).__init__() self.broker = broker self.org = org @@ -77,8 +78,12 @@ def __init__(self, broker, org, i, t = None, e = None): self.bit = self.broker.bitClass(self.broker, self.org, i) self.stype = t self.exclude = e + self.filter = f + # override + if self.filter: + self.stype = "jenkins" pprint("Initialized thread %i" % i) - + def run(self): global BIG_LOCK, PENDING_OBJECTS time.sleep(0.5) # Primarily to align printouts. @@ -89,6 +94,7 @@ def run(self): try: # Try grabbing an object (might not be any left!) obj = PENDING_OBJECTS.pop(0) + #print("object: %s" %(obj)) except: pass BIG_LOCK.release() @@ -97,14 +103,17 @@ def run(self): if isMine(obj['sourceID'], self.broker.config): # Run through list of scanners in order, apply when useful for sid, scanner in plugins.scanners.enumerate(): - + if scanner.accepts(obj): self.bit.pluginname = "plugins/scanners/" + sid # Excluded scanner type? if self.exclude and sid in self.exclude: continue + # specific jenkins filter + if self.stype and self.stype == sid and self.filter and sid == "jenkins": + scanner.scan(self.bit, obj, self.filter) # Specific scanner type or no types mentioned? - if not self.stype or self.stype == sid: + elif not self.stype or self.stype == sid: scanner.scan(self.bit, obj) else: break @@ -115,13 +124,13 @@ def main(): pprint("Kibble Scanner v/%s starting" % VERSION) global CONFIG_FILE, PENDING_OBJECTS args = base_parser().parse_args() - + # Load config yaml if args.config: CONFIG_FILE = args.config config = yaml.load(open(CONFIG_FILE), Loader=yaml.Loader) pprint("Loaded YAML config from %s" % CONFIG_FILE) - + # Which broker type do we use here? 
broker = None if 'elasticsearch' in config and config['elasticsearch'].get('enabled', False): @@ -130,14 +139,14 @@ def main(): else: pprint("Using HTTP JSON broker model") broker = plugins.brokers.kibbleJSON.Broker(config) - + orgNo = 0 sourceNo = 0 for org in broker.organisations(): if not args.org or args.org == org.id: pprint("Processing organisation %s" % org.id) orgNo += 1 - + # Compile source list # If --age is passed, only append source that either # have never been scanned, or have been scanned more than @@ -161,21 +170,21 @@ def main(): if not args.source or (args.source == source['sourceID']) or (args.source == source['sourceURL']): PENDING_OBJECTS.append(source) sourceNo += len(PENDING_OBJECTS) - + # Start up some threads equal to number of cores on the box, # but no more than 4. We don't want an IOWait nightmare. threads = [] core_count = min((4, int( multiprocessing.cpu_count() ))) for i in range(0, core_count): - sThread = scanThread(broker, org, i+1, args.type, args.exclude) + sThread = scanThread(broker, org, i+1, args.type, args.exclude, args.filter) sThread.start() threads.append(sThread) - + # Wait for them all to finish. for t in threads: t.join() - + pprint("All done scanning for now, found %i organisations and %i sources to process." % (orgNo, sourceNo)) - + if __name__ == '__main__': main() diff --git a/src/plugins/scanners/jenkins.py b/src/plugins/scanners/jenkins.py index c09920c..579ba8b 100644 --- a/src/plugins/scanners/jenkins.py +++ b/src/plugins/scanners/jenkins.py @@ -20,18 +20,21 @@ import re import json import hashlib -import plugins.utils.jsonapi + import threading import requests.exceptions import os import urllib.parse +from plugins.utils import jsonapi + + """ This is the Kibble Jenkins scanner plugin. 
""" title = "Scanner for Jenkins CI" -version = "0.1.0" +version = "0.2.0" def accepts(source): """ Determines whether we want to handle this source """ @@ -42,39 +45,43 @@ def accepts(source): def scanJob(KibbleBit, source, job, creds): """ Scans a single job for activity """ - NOW = int(datetime.datetime.utcnow().timestamp()) + NOW = int(datetime.datetime.now(datetime.timezone.utc).timestamp()) jname = job['name'] if job.get('folder'): jname = job.get('folder') + '-' + job['name'] - dhash = hashlib.sha224( ("%s-%s-%s" % (source['organisation'], source['sourceURL'], jname) ).encode('ascii', errors='replace')).hexdigest() + dhash = hashlib.sha224( ("%s-%s-%s" % (source['organisation'], source['sourceURL'], jname) ) + .encode('ascii', errors='replace')).hexdigest() found = True doc= None parseIt = False found = KibbleBit.exists('cijob', dhash) - + # Get $jenkins/job/$job-name/json... jobURL = "%s/api/json?depth=2&tree=builds[number,status,timestamp,id,result,duration]" % job['fullURL'] KibbleBit.pprint(jobURL) - jobjson = plugins.utils.jsonapi.get(jobURL, auth = creds) - + + jobjson = jsonapi.get(jobURL, auth = creds) + # If valid JSON, ... if jobjson: + print("jobjson builds: %s" %( jobjson)) for build in jobjson.get('builds', []): - buildhash = hashlib.sha224( ("%s-%s-%s-%s" % (source['organisation'], source['sourceURL'], jname, build['id']) ).encode('ascii', errors='replace')).hexdigest() + buildhash = hashlib.sha224( ("%s-%s-%s-%s" % (source['organisation'], source['sourceURL'], jname, build['id']) ) + .encode('ascii', errors='replace')).hexdigest() builddoc = None try: builddoc = KibbleBit.get('ci_build', buildhash) except: pass - + # If this build already completed, no need to parse it again if builddoc and builddoc.get('completed', False): continue - + KibbleBit.pprint("[%s-%s] This is new or pending, analyzing..." 
% (jname, build['id'])) - + completed = True if build['result'] else False - + # Estimate time spent in queue queuetime = 0 TS = int(build['timestamp']/1000) @@ -82,7 +89,7 @@ def scanJob(KibbleBit, source, job, creds): queuetime = builddoc.get('queuetime', 0) if not completed: queuetime = NOW - TS - + # Get build status (success, failed, canceled etc) status = 'building' if build['result'] in ['SUCCESS', 'STABLE']: @@ -91,13 +98,13 @@ def scanJob(KibbleBit, source, job, creds): status = 'failed' if build['result'] in ['ABORTED']: status = 'aborted' - + # Calc when the build finished (jenkins doesn't show this) if completed: FIN = int(build['timestamp'] + build['duration']) / 1000 else: FIN = 0 - + doc = { # Build specific data 'id': buildhash, @@ -111,7 +118,7 @@ def scanJob(KibbleBit, source, job, creds): 'started': int(build['timestamp']/1000), 'ci': 'jenkins', 'queuetime': queuetime, - + # Standard docs values 'sourceID': source['sourceID'], 'organisation': source['organisation'], @@ -120,7 +127,7 @@ def scanJob(KibbleBit, source, job, creds): KibbleBit.append('ci_build', doc) # Yay, it worked! return True - + # Boo, it failed! 
KibbleBit.pprint("Fetching job data failed!") return False @@ -135,7 +142,7 @@ def __init__(self, block, KibbleBit, source, creds, jobs): self.creds = creds self.source = source self.jobs = jobs - + def run(self): badOnes = 0 while len(self.jobs) > 0 and badOnes <= 50: @@ -160,7 +167,8 @@ def run(self): self.KibbleBit.pprint("Too many errors, bailing!") self.source['steps']['issues'] = { 'time': time.time(), - 'status': 'Too many errors while parsing at ' + time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(time.time())), + 'status': 'Too many errors while parsing at ' + + time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(time.time())), 'running': False, 'good': False } @@ -169,11 +177,12 @@ def run(self): else: badOnes = 0 -def scan(KibbleBit, source): +def scan(KibbleBit, source, filter=None): # Simple URL check jenkins = re.match(r"(https?://.+)", source['sourceURL']) if jenkins: - + if not 'steps' in source: + source['steps'] = {} source['steps']['jenkins'] = { 'time': time.time(), 'status': 'Parsing Jenkins job changes...', @@ -181,7 +190,7 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - + badOnes = 0 pendingJobs = [] KibbleBit.pprint("Parsing Jenkins activity at %s" % source['sourceURL']) @@ -192,32 +201,41 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - + # Jenkins may neeed credentials creds = None - if source['creds'] and 'username' in source['creds'] and source['creds']['username'] and len(source['creds']['username']) > 0: + if ('creds' in source and source['creds'] and 'username' in source['creds'] and source['creds']['username'] + and len(source['creds']['username']) > 0): creds = "%s:%s" % (source['creds']['username'], source['creds']['password']) - + + if not creds: + KibbleBit.pprint("JENKINS with no %s authentication." 
% source['sourceURL']) + # Get the job list - sURL = source['sourceURL'] - KibbleBit.pprint("Getting job list...") - jobsjs = plugins.utils.jsonapi.get("%s/api/json?tree=jobs[name,color]&depth=1" % sURL , auth = creds) - + sURL: str = source['sourceURL'] + #print("queue URL:", sURL) + KibbleBit.pprint("Getting jenkins job list..." ) + jobsjs = jsonapi.get("%s/api/json?tree=jobs[name,color]&depth=1" % sURL , auth = creds) + #print ("jobsjs:", jobsjs) + # Get the current queue + # This is always at the root of the build instance KibbleBit.pprint("Getting job queue...") - queuejs = plugins.utils.jsonapi.get("%s/queue/api/json?depth=1" % sURL , auth = creds) - + + queuejs = jsonapi.get("%s/queue/api/json?depth=1" % sURL , auth = creds) + # Save queue snapshot - NOW = int(datetime.datetime.utcnow().timestamp()) - queuehash = hashlib.sha224( ("%s-%s-queue-%s" % (source['organisation'], source['sourceURL'], int(time.time())) ).encode('ascii', errors='replace')).hexdigest() - - + NOW = int(datetime.datetime.now(datetime.timezone.utc).timestamp()) + queuehash = hashlib.sha224( ("%s-%s-queue-%s" % (source['organisation'], source['sourceURL'], int(time.time())) ) + .encode('ascii', errors='replace')).hexdigest() + + # Scan queue items blocked = 0 stuck = 0 totalqueuetime = 0 items = queuejs.get('items', []) - + for item in items: if item['blocked']: blocked += 1 @@ -225,11 +243,11 @@ def scan(KibbleBit, source): stuck += 1 if 'inQueueSince' in item: totalqueuetime += (NOW - int(item['inQueueSince']/1000)) - + avgqueuetime = totalqueuetime / max(1, len(items)) - + # Count how many jobs are building, find any folders... 
- actual_jobs, building = get_all_jobs(KibbleBit, source, jobsjs.get('jobs', []), creds) + actual_jobs, building = get_all_jobs(KibbleBit, source, jobsjs.get('jobs', []), filter, creds) # Write up a queue doc queuedoc = { @@ -242,18 +260,18 @@ def scan(KibbleBit, source): 'stuck': stuck, 'avgwait': avgqueuetime, 'ci': 'jenkins', - + # Standard docs values 'sourceID': source['sourceID'], 'organisation': source['organisation'], 'upsert': True, } KibbleBit.append('ci_queue', queuedoc) - - + + pendingJobs = actual_jobs KibbleBit.pprint("Found %u jobs in Jenkins" % len(pendingJobs)) - + threads = [] block = threading.Lock() KibbleBit.pprint("Scanning jobs using 4 sub-threads") @@ -261,32 +279,48 @@ def scan(KibbleBit, source): t = jenkinsThread(block, KibbleBit, source, creds, pendingJobs) threads.append(t) t.start() - + for t in threads: t.join() - # We're all done, yaay + # We're all done, yaay KibbleBit.pprint("Done scanning %s" % source['sourceURL']) + partial = "(filtered) " if filter else '' source['steps']['issues'] = { 'time': time.time(), - 'status': 'Jenkins successfully scanned at ' + time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(time.time())), + 'status': 'Jenkins successfully '+ partial+'scanned at ' + time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(time.time())), 'running': False, 'good': True } KibbleBit.updateSource(source) - -def get_all_jobs(KibbleBit, source, joblist, creds): + +def get_all_jobs(KibbleBit, source, joblist, job_filter, creds): real_jobs = [] building = 0 for job in joblist: + + #print("jobFilter: ", job_filter) + if (job_filter and job['name'] not in job_filter): + print("Skipping job", job['name']) + continue + # Is this a job folder? jclass = job.get('_class') - if jclass in ['jenkins.branch.OrganizationFolder', 'org.jenkinsci.plugins.workflow.multibranch.WorkflowMultiBranchProject']: + + #KibbleBit.pprint("%s has class %s..." 
% (job['name'], jclass)) + + if jclass in ['jenkins.branch.OrganizationFolder', + 'org.jenkinsci.plugins.workflow.multibranch.WorkflowMultiBranchProject', + 'org.jenkinsci.plugins.workflow.job.WorkflowJob', + 'com.cloudbees.hudson.plugins.folder.Folder']: KibbleBit.pprint("%s is a jobs folder, expanding..." % job['name']) + csURL = '%s/job/%s' % (source['sourceURL'], urllib.parse.quote(job['name'].replace('/', '%2F'))) + try: - child_jobs = plugins.utils.jsonapi.get("%s/api/json?tree=jobs[name,color]&depth=1" % csURL, + + child_jobs = jsonapi.get("%s/api/json?tree=jobs[name,color]&depth=1" % csURL, auth=creds) csource = dict(source) csource['sourceURL'] = csURL @@ -294,7 +328,9 @@ def get_all_jobs(KibbleBit, source, joblist, creds): csource['folder'] = job['name'] else: csource['folder'] += '-' + job['name'] - cjobs, cbuilding = get_all_jobs(KibbleBit, csource, child_jobs.get('jobs', []), creds) + cjobs, cbuilding = get_all_jobs(KibbleBit, csource, child_jobs.get('jobs', []), job_filter, creds) + + KibbleBit.pprint("%s (job/folder) entries found." % (len(cjobs)) ) building += cbuilding for cjob in cjobs: real_jobs.append(cjob) @@ -308,5 +344,6 @@ def get_all_jobs(KibbleBit, source, joblist, creds): building += 1 job['fullURL'] = '%s/job/%s' % (source['sourceURL'], urllib.parse.quote(job['name'].replace('/', '%2F'))) job['folder'] = source.get('folder') + #KibbleBit.pprint("Found job %s ..." 
% job) real_jobs.append(job) return real_jobs, building diff --git a/src/plugins/scanners/travis.py b/src/plugins/scanners/travis.py index a42dae9..e6cfadb 100644 --- a/src/plugins/scanners/travis.py +++ b/src/plugins/scanners/travis.py @@ -41,19 +41,19 @@ def accepts(source): def scanJob(KibbleBit, source, bid, token, TLD): """ Scans a single job for activity """ - NOW = int(datetime.datetime.utcnow().timestamp()) + NOW = int(datetime.datetime.now(datetime.timezone.utc).timestamp()) dhash = hashlib.sha224( ("%s-%s-%s" % (source['organisation'], source['sourceURL'], bid) ).encode('ascii', errors='replace')).hexdigest() found = True doc= None parseIt = False found = KibbleBit.exists('cijob', dhash) - + # Get the job data pages = 0 offset = 0 last_page = False oURL = "https://api.travis-ci.%s/repo/%s/builds" % (TLD, bid) - + # For as long as pagination makes sense... while last_page == False: bURL = "https://api.travis-ci.%s/repo/%s/builds?limit=100&offset=%u" % (TLD, bid, offset) @@ -65,13 +65,13 @@ def scanJob(KibbleBit, source, bid, token, TLD): if repojs['@pagination']['is_last']: KibbleBit.pprint("Assuming this is the last page we need (travis says so)") last_page = True - + KibbleBit.pprint("%s has %u builds done" % (bURL, repojs['@pagination']['count'])) - + # BREAKER: If we go past count somehow, and travis doesn't say so, bork anyway if repojs['@pagination']['count'] < offset: return True - + offset += 100 for build in repojs.get('builds', []): buildID = build['id'] @@ -81,15 +81,15 @@ def scanJob(KibbleBit, source, bid, token, TLD): duration = build['duration'] completed = True if duration else False duration = duration or 0 - - + + buildhash = hashlib.sha224( ("%s-%s-%s-%s" % (source['organisation'], source['sourceURL'], bid, buildID) ).encode('ascii', errors='replace')).hexdigest() builddoc = None try: builddoc = KibbleBit.get('ci_build', buildhash) except: pass - + # If this build already completed, no need to parse it again if builddoc and 
builddoc.get('completed', False): # If we're on page > 1 and we've seen a completed build, assume @@ -99,7 +99,7 @@ def scanJob(KibbleBit, source, bid, token, TLD): last_page = True break continue - + # Get build status (success, failed, canceled etc) status = 'building' if build['state'] in ['finished', 'passed']: @@ -108,17 +108,17 @@ def scanJob(KibbleBit, source, bid, token, TLD): status = 'failed' if build['state'] in ['aborted', 'canceled']: status = 'aborted' - + FIN = 0 STA = 0 if finishedAt: FIN = datetime.datetime.strptime(finishedAt, "%Y-%m-%dT%H:%M:%SZ").timestamp() if startedAt: STA = int(datetime.datetime.strptime(startedAt, "%Y-%m-%dT%H:%M:%SZ").timestamp()) - + # We don't know how to calc queues yet, set to 0 queuetime = 0 - + doc = { # Build specific data 'id': buildhash, @@ -132,7 +132,7 @@ def scanJob(KibbleBit, source, bid, token, TLD): 'started': STA, 'ci': 'travis', 'queuetime': queuetime, - + # Standard docs values 'sourceID': source['sourceID'], 'organisation': source['organisation'], @@ -158,7 +158,7 @@ def __init__(self, block, KibbleBit, source, token, jobs, TLD): self.source = source self.jobs = jobs self.tld = TLD - + def run(self): badOnes = 0 while len(self.jobs) > 0 and badOnes <= 50: @@ -201,7 +201,7 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - + badOnes = 0 pendingJobs = [] KibbleBit.pprint("Parsing Travis activity at %s" % source['sourceURL']) @@ -212,7 +212,7 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - + # Travis needs a token token = None if source['creds'] and 'token' in source['creds'] and source['creds']['token'] and len(source['creds']['token']) > 0: @@ -220,14 +220,14 @@ def scan(KibbleBit, source): else: KibbleBit.pprint("Travis CI requires a token to work!") return False - + # Get the job list, paginated sURL = source['sourceURL'] - + # Used for pagination jobs = 100 offset = 0 - + # Counters; builds queued, running and total jobs queued = 0 # We don't 
know how to count this yet building = 0 @@ -235,16 +235,16 @@ def scan(KibbleBit, source): blocked = 0 # Dunno how to count yet stuck = 0 # Ditto avgqueuetime = 0 # Ditto, fake it - + maybeQueued = [] while jobs == 100: URL = "https://api.travis-ci.%s/repos?repository.active=true&sort_by=current_build:desc&offset=%u&limit=100&include=repository.last_started_build" % (TLD, offset) offset += 100 r = requests.get(URL, headers = {'Travis-API-Version': '3', 'Authorization': "token %s" % token}) - + if r.status_code != 200: KibbleBit.pprint("Travis did not return a 200 Okay, bad token?!") - + source['steps']['travis'] = { 'time': time.time(), 'status': 'Travis CI scan failed at ' + time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(time.time()) + ". Bad token??!"), @@ -253,8 +253,8 @@ def scan(KibbleBit, source): } KibbleBit.updateSource(source) return - - + + # For each build job js = r.json() for repo in js['repositories']: @@ -265,15 +265,15 @@ def scan(KibbleBit, source): if cb['state'] in ['started','created', 'queued', 'pending']: for job in cb.get('jobs', []): maybeQueued.append(job['id']) - - + + # Queue up build jobs for the threaded scanner bid = repo['id'] pendingJobs.append(bid) - + jobs = len(js['repositories']) KibbleBit.pprint("Scanned %u jobs..." % total) - + # Find out how many building and pending jobs for jobID in maybeQueued: URL = "https://api.travis-ci.%s/job/%u" % (TLD, jobID) @@ -288,11 +288,11 @@ def scan(KibbleBit, source): blocked += 1 # Queued in Travis generally means a job can't find an executor, and thus is blocked. KibbleBit.pprint("Job %u is pending" % jobID) KibbleBit.pprint("%u building, %u queued..." 
% (building, queued)) - + # Save queue snapshot - NOW = int(datetime.datetime.utcnow().timestamp()) + NOW = int(datetime.datetime.now(datetime.timezone.utc).timestamp()) queuehash = hashlib.sha224( ("%s-%s-queue-%s" % (source['organisation'], source['sourceURL'], int(time.time())) ).encode('ascii', errors='replace')).hexdigest() - + # Write up a queue doc queuedoc = { 'id': queuehash, @@ -304,17 +304,17 @@ def scan(KibbleBit, source): 'stuck': stuck, 'avgwait': avgqueuetime, 'ci': 'travis', - + # Standard docs values 'sourceID': source['sourceID'], 'organisation': source['organisation'], 'upsert': True, } KibbleBit.append('ci_queue', queuedoc) - - + + KibbleBit.pprint("Found %u jobs in Travis" % len(pendingJobs)) - + threads = [] block = threading.Lock() KibbleBit.pprint("Scanning jobs using 4 sub-threads") @@ -322,11 +322,11 @@ def scan(KibbleBit, source): t = travisThread(block, KibbleBit, source, token, pendingJobs, TLD) threads.append(t) t.start() - + for t in threads: t.join() - # We're all done, yaay + # We're all done, yaay KibbleBit.pprint("Done scanning %s" % source['sourceURL']) source['steps']['travis'] = { @@ -336,4 +336,3 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - \ No newline at end of file diff --git a/src/plugins/utils/jsonapi.py b/src/plugins/utils/jsonapi.py index 04d173f..9405e2f 100644 --- a/src/plugins/utils/jsonapi.py +++ b/src/plugins/utils/jsonapi.py @@ -40,6 +40,7 @@ def get(url, cookie = None, auth = None, token = None, retries = 5, timeout = 30 headers["Authorization"] = "token %s" % token if cookie: headers["Cookie"] = cookie + # print("fetching url %s" % url) rv = requests.get(url, headers = headers, timeout = (CONNECT_TIMEOUT, timeout)) # Some services may be rate limited. We'll try sleeping it off in 60 second # intervals for a max of five minutes, then give up. 
@@ -85,4 +86,3 @@ def post(url, data, cookie = None, auth = None): rv = requests.post(url, headers = headers, json = data) js = rv.json() return js - From bc5df1b77faa2699e8343490dfd91c94f3d812e9 Mon Sep 17 00:00:00 2001 From: Georg Kallidis Date: Wed, 11 Dec 2024 16:19:38 +0100 Subject: [PATCH 9/9] Provide a simple build environment using setuptools in pyproject.toml; add info about packaging in README.md; Add conf/config.yaml.sample and ignore config.yaml in .gitignore. Fix pre_commit package name to underscore in requirements. --- .gitignore | 34 ++++++++++++++ README.md | 22 +++++++-- conf/{config.yaml => config.yaml.sample} | 0 pyproject.toml | 60 ++++++++++++++++++++++++ requirements.txt | 2 +- 5 files changed, 112 insertions(+), 6 deletions(-) create mode 100644 .gitignore rename conf/{config.yaml => config.yaml.sample} (100%) create mode 100644 pyproject.toml diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..83d5114 --- /dev/null +++ b/.gitignore @@ -0,0 +1,34 @@ +# Configuration file: +/conf/config.yaml + +# Distribution / packaging +build/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +*.egg-info/ +/dist/ +MANIFEST + +# Python cache / compiled files: +__pycache__/ +*.py[cod] + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Environments +/.venv*/ +/venv*/ +/.env*/ +/env*/ + + +# JetBrains IDE +/.idea/ diff --git a/README.md b/README.md index 21cea0c..9883b1e 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,10 @@ The Kibble Scanners collect information for the Kibble Suite. - Edit conf/config.yaml to match your Kibble service +### Dependencies + + - a running Elasticsearch server + ## How to run: - On a daily/weekly/whatever basis, run in folder src: `python3 kibble-scanner.py`. @@ -75,7 +79,9 @@ The Kibble Scanners collect information for the Kibble Suite. 
- python3-certifi - python3-yaml - ## Build environment +### Testing + + TBD ### Pre-commit @@ -98,13 +104,19 @@ The Kibble Scanners collect information for the Kibble Suite. If installed the pre-commit reads the configuration, and will check on the hooks, currently pre-commit and pre-push. While the checks are not satisfied, just rerun the commit command until the hook checks are passed. -### Testing - - TBD ### Project build - TBD + After installation of the build tool + + pip install -q build + + build the project by running + + python -m build + + Find more information in pyproject.toml file and [Setuptools](https://setuptools.pypa.io/). # Get involved + TBD. Please see https://kibble.apache.org/ for details! diff --git a/conf/config.yaml b/conf/config.yaml.sample similarity index 100% rename from conf/config.yaml rename to conf/config.yaml.sample diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..708dca3 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,60 @@ +############################## +# Python packaging settings: # + +[build-system] +requires = ["setuptools", "setuptools-scm"] +build-backend = "setuptools.build_meta" +
+[project] +name = "kibble-scanners" +description = "The Kibble Scanners collect information for the Kibble Suite. Apache Kibble is a tool to collect, aggregate and visualize data about any software project that uses commonly known tools." 
+version = "1.0.0" + +dependencies = [ + "python-dateutil", + "certifi", + "requests", + "psutil", + "elasticsearch", + "PyYAML>=5.2", +] + +requires-python = ">=3.9, <4.0" + +authors = [ + { name = "Apache Software Foundation", email = "dev@kibble.apache.org" }, +] +maintainers = [ + { name = "Apache Software Foundation", email="dev@kibble.apache.org" }, +] +keywords = [ + "kibble-scanners", "data" ] + +license = { text = "Apache License, Version 2.0" } +classifiers = [ + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Environment :: Console", + "Framework :: Apache Kibble-Scanners", + "License :: OSI Approved :: Apache Software License", + "Intended Audience :: Developers", + "Intended Audience :: System Administrators", + "Topic :: System :: Monitoring" +] + +dynamic = [ + "readme" +] + +[project.urls] +repository = "https://github.com/apache/kibble-scanners.git" +"Bug Tracker" = "https://github.com/apache/kibble-scanners/issues" + + +[tool.setuptools] +dynamic = { readme = { file = ["README.md"] } } +packages.find = { where = ["src"] } diff --git a/requirements.txt b/requirements.txt index 668c04d..6755b2d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,4 @@ psutil python-dateutil requests pyyaml -pre-commit +pre_commit