From 0f8e20bc73ea9f4f65aafbaa44ab4bd518eee4f4 Mon Sep 17 00:00:00 2001 From: Georg Kallidis Date: Wed, 13 Nov 2024 12:21:41 +0100 Subject: [PATCH 1/9] Fix indentation in line KibbleBit.updateSource --- src/plugins/scanners/jira.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/plugins/scanners/jira.py b/src/plugins/scanners/jira.py index e996b7c..58a8daa 100644 --- a/src/plugins/scanners/jira.py +++ b/src/plugins/scanners/jira.py @@ -330,13 +330,13 @@ def scan(KibbleBit, source): except requests.exceptions.ConnectionError as err: KibbleBit.pprint("Connection error, skipping this ticket for now!") source['steps']['issues'] = { - 'time': time.time(), - 'status': 'Connection error occurred while scanning', - 'running': False, - 'good': False - } - KibbleBit.updateSource(source) - return + 'time': time.time(), + 'status': 'Connection error occurred while scanning', + 'running': False, + 'good': False + } + KibbleBit.updateSource(source) + return if 'issues' in js and len(js['issues']) == 1: key = js['issues'][0]['key'] m = re.search(r"-(\d+)$", key) From 1da9b2622537b0f0140d67e9a809cc5f46f4cbfd Mon Sep 17 00:00:00 2001 From: Georg Kallidis Date: Thu, 14 Nov 2024 12:31:15 +0100 Subject: [PATCH 2/9] More configurable elasticsearch; add a versionhint in config.yaml for configuration initialization; Make KibbleWrapper classes private;i Allow EL 8.x configuration, currently (untested) basic authentication with options; Add debug traceback method in kibbleBit class; Fix pyyaml 5.1+ requires Loader=; --- conf/config.yaml | 1 + src/kibble-scanner.py | 2 +- src/plugins/brokers/kibbleES.py | 70 +++++++++++++++++++++++---------- 3 files changed, 51 insertions(+), 22 deletions(-) diff --git a/conf/config.yaml b/conf/config.yaml index d835539..98eb724 100644 --- a/conf/config.yaml +++ b/conf/config.yaml @@ -6,6 +6,7 @@ elasticsearch: ssl: false uri: "" database: kibble + versionHint: 8 # If enabled, kibble scanners will use the HTTP JSON API 
broker: diff --git a/src/kibble-scanner.py b/src/kibble-scanner.py index 6d67954..c8677ba 100644 --- a/src/kibble-scanner.py +++ b/src/kibble-scanner.py @@ -119,7 +119,7 @@ def main(): # Load config yaml if args.config: CONFIG_FILE = args.config - config = yaml.load(open(CONFIG_FILE)) + config = yaml.load(open(CONFIG_FILE), Loader=yaml.Loader) pprint("Loaded YAML config from %s" % CONFIG_FILE) # Which broker type do we use here? diff --git a/src/plugins/brokers/kibbleES.py b/src/plugins/brokers/kibbleES.py index 6e158a0..aeaaf64 100644 --- a/src/plugins/brokers/kibbleES.py +++ b/src/plugins/brokers/kibbleES.py @@ -25,7 +25,7 @@ ACCEPTED_DB_VERSIONS = [1,2] # Versions we know how to work with. -class KibbleESWrapper(object): +class _KibbleESWrapper(object): """ Class for rewriting old-style queries to the new ones, where doc_type is an integral part of the DB name @@ -66,13 +66,17 @@ def __init__(self, ES): def exists(self, index): return self.ES.indices.exists(index = index) -class KibbleESWrapperSeven(object): +class _KibbleESWrapperSeven(object): """ Class for rewriting old-style queries to the new ones, where doc_type is an integral part of the DB name and NOT USED (>= 7.x) """ - def __init__(self, ES): - self.ES = ES + def __init__(self, ES, auth): + if (auth is not None): + self.ES = ES.options(basic_auth=auth) + else: + self.ES = ES + self.indices = self.indicesClass(ES) def get(self, index, doc_type, id): @@ -104,8 +108,8 @@ def __init__(self, ES): def exists(self, index): return self.ES.indices.exists(index = index) + - # This is redundant, refactor later? 
def pprint(string, err = False): @@ -144,9 +148,9 @@ def pprint(self, string, err = False): def updateSource(self, source): """ Updates a source document, usually with a status update """ self.broker.DB.index(index=self.broker.config['elasticsearch']['database'], - doc_type="source", - id=source['sourceID'], - body = source + doc_type="source", + id=source['sourceID'], + body = source ) def get(self, doctype, docid): @@ -189,14 +193,16 @@ def bulk(self): dbname = self.broker.config['elasticsearch']['database'] if self.broker.noTypes: dbname += "_%s" % js['doctype'] - js_arr.append({ + defaultJSON = { '_op_type': 'update' if js.get('upsert') else 'index', '_index': dbname, - '_type': '_doc', '_id': js['id'], 'doc' if js.get('upsert') else '_source': doc, 'doc_as_upsert': True, - }) + } + if self.broker.seven is False: + defaultJSON['_type'] = '_doc' + js_arr.append( defaultJSON ) else: js_arr.append({ '_op_type': 'update' if js.get('upsert') else 'index', @@ -210,6 +216,15 @@ def bulk(self): elasticsearch.helpers.bulk(self.broker.oDB, js_arr) except Exception as err: pprint("Warning: Could not bulk insert: %s" % err) + self.traceBack() + + def traceBack(): + err_type, err_value, tb = sys.exc_info() + traceback_output = ['API traceback:'] + traceback_output += traceback.format_tb(tb) + traceback_output.append('%s: %s' % (err_type.__name__, err_value)) + pprint("Error: traceback_output: %s" % (traceback_output)) + return traceback_output class KibbleOrganisation: @@ -279,14 +294,23 @@ def __init__(self, config): if 'user' in es_config: auth = (es_config['user'], es_config['password']) pprint("Connecting to ElasticSearch database at %s:%i..." 
% (es_config['hostname'], es_config.get('port', 9200))) - es = elasticsearch.Elasticsearch([{ + + defaultELConfig = { 'host': es_config['hostname'], - 'port': int(es_config.get('port', 9200)), - 'use_ssl': es_config.get('ssl', False), - 'verify_certs': False, - 'url_prefix': es_config.get('uri', ''), - 'http_auth': auth - }], + 'port': int(es_config.get('port', 9200)) + } + versionHint = config['elasticsearch']['versionHint'] + if (versionHint >= 7): + defaultELConfig['scheme'] = 'https' if (es_config['ssl']) else 'http' + defaultELConfig['path_prefix'] = es_config.get('uri', '') + # defaultELConfig['basic_auth'] = auth configured like .options(basic_auth=auth)).search + else: + defaultELConfig['use_ssl'] = es_config.get('ssl', False) + defaultELConfig['verify_certs'] = False + defaultELConfig['url_prefix'] = es_config.get('uri', '') + defaultELConfig['http_auth'] = auth + + es = elasticsearch.Elasticsearch([ defaultELConfig ], max_retries=5, retry_on_timeout=True ) @@ -299,13 +323,17 @@ def __init__(self, config): # This bit is required since ES 6.x and above don't like document types self.noTypes = True if int(es_info['version']['number'].split('.')[0]) >= 6 else False self.seven = True if int(es_info['version']['number'].split('.')[0]) >= 7 else False + self.eight = True if int(es_info['version']['number'].split('.')[0]) >= 8 else False if self.noTypes: pprint("This is a type-less DB, expanding database names instead.") - if self.seven: + if self.eight && auth is not None: + pprint("We're using ES >= 8.x, NO DOC_TYPE WITH BASIC_AUTH OPTIONS ") + es = _KibbleESWrapperSeven(es, auth) + elif self.seven: pprint("We're using ES >= 7.x, NO DOC_TYPE!") - es = KibbleESWrapperSeven(es) + es = _KibbleESWrapperSeven(es, None) else: - es = KibbleESWrapper(es) + es = _KibbleESWrapper(es) self.DB = es if not es.indices.exists(index = es_config['database'] + "_api"): sys.stderr.write("Could not find database group %s_* in ElasticSearch!\n" % es_config['database']) From 
60b86be636283d91a661210acb6b58e1ffb6c9be Mon Sep 17 00:00:00 2001 From: Georg Kallidis Date: Tue, 19 Nov 2024 14:26:24 +0100 Subject: [PATCH 3/9] Explain source argument restriction, allow sourceURL --- src/kibble-scanner.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/kibble-scanner.py b/src/kibble-scanner.py index c8677ba..ac43477 100644 --- a/src/kibble-scanner.py +++ b/src/kibble-scanner.py @@ -38,7 +38,7 @@ def base_parser(): arg_parser.add_argument("-o", "--org", help="The organisation to gather stats for. If left out, all organisations will be scanned.") arg_parser.add_argument("-f", "--config", help="Location of the yaml config file (full path)") arg_parser.add_argument("-a", "--age", help="Minimum age in hours before performing a new scan on an already processed source. --age 12 will not process any source that was processed less than 12 hours ago, but will process new sources.") - arg_parser.add_argument("-s", "--source", help="A specific source (wildcard) to run scans on.") + arg_parser.add_argument("-s", "--source", help="A specific (existing in any org) source (wildcard) to run scans on.") arg_parser.add_argument("-n", "--nodes", help="Number of nodes in the cluster (used for load balancing)") arg_parser.add_argument("-t", "--type", help="Specific type of scanner to run (default is run all scanners)") arg_parser.add_argument("-e", "--exclude", nargs = '+', help="Specific type of scanner(s) to exclude") @@ -157,7 +157,8 @@ def main(): else: PENDING_OBJECTS = [] for source in org.sources(view=args.view): - if not args.source or (args.source == source['sourceID']): + #pprint("Checkng source %s" % source) + if not args.source or (args.source == source['sourceID']) or (args.source == source['sourceURL']): PENDING_OBJECTS.append(source) sourceNo += len(PENDING_OBJECTS) From 009103608317b8da39bab1dd547bc062d1efb42c Mon Sep 17 00:00:00 2001 From: Georg Kallidis Date: Tue, 19 Nov 2024 14:26:56 +0100 Subject: [PATCH 4/9] Fix 
condition, add self argument for class method traceback --- src/plugins/brokers/kibbleES.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/plugins/brokers/kibbleES.py b/src/plugins/brokers/kibbleES.py index aeaaf64..a581f30 100644 --- a/src/plugins/brokers/kibbleES.py +++ b/src/plugins/brokers/kibbleES.py @@ -218,7 +218,7 @@ def bulk(self): pprint("Warning: Could not bulk insert: %s" % err) self.traceBack() - def traceBack(): + def traceBack(self): err_type, err_value, tb = sys.exc_info() traceback_output = ['API traceback:'] traceback_output += traceback.format_tb(tb) @@ -326,7 +326,7 @@ def __init__(self, config): self.eight = True if int(es_info['version']['number'].split('.')[0]) >= 8 else False if self.noTypes: pprint("This is a type-less DB, expanding database names instead.") - if self.eight && auth is not None: + if self.eight and auth is not None: pprint("We're using ES >= 8.x, NO DOC_TYPE WITH BASIC_AUTH OPTIONS ") es = _KibbleESWrapperSeven(es, auth) elif self.seven: From e7e1499790b5205134e45d3c26ad6f3ab214a2c6 Mon Sep 17 00:00:00 2001 From: Georg Kallidis Date: Tue, 19 Nov 2024 14:29:28 +0100 Subject: [PATCH 5/9] Fix scan for github sources by adding empty steps key --- src/plugins/scanners/git-sync.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/plugins/scanners/git-sync.py b/src/plugins/scanners/git-sync.py index 5997f3e..234e33b 100644 --- a/src/plugins/scanners/git-sync.py +++ b/src/plugins/scanners/git-sync.py @@ -30,13 +30,14 @@ def accepts(source): """ Do we accept this source? """ if source['type'] == 'git': return True - # There are cases where we have a github repo, but don't wanna annalyze the code, just issues + # There are cases where we have a github repo, but don't wanna analyze the code, just issues if source['type'] == 'github' and source.get('issuesonly', False) == False: return True return False def scan(KibbleBit, source): + #KibbleBit.pprint("Scan source: %s." 
% source) # Get some vars, construct a data path for the repo path = source['sourceID'] url = source['sourceURL'] @@ -63,6 +64,8 @@ def scan(KibbleBit, source): KibbleBit.pprint("Checking out %s as %s" % (url, path)) try: + if 'steps' not in source: # initial fetch of a github repo may miss steps + source['steps'] = {} source['steps']['sync'] = { 'time': time.time(), 'status': 'Fetching code data from source location...', From 55f5d27313d696fd771522c61277739a36d2580f Mon Sep 17 00:00:00 2001 From: Georg Kallidis Date: Fri, 22 Nov 2024 10:37:04 +0100 Subject: [PATCH 6/9] Github-issue: Fix reports from github with closed_at with None value; KibbleES: Fix missing traceback module and check result from el.helpers.bulk; Add print statements. --- src/plugins/brokers/kibbleES.py | 66 ++++++++++++++------------- src/plugins/scanners/github-issues.py | 13 +++--- 2 files changed, 41 insertions(+), 38 deletions(-) diff --git a/src/plugins/brokers/kibbleES.py b/src/plugins/brokers/kibbleES.py index a581f30..28364b6 100644 --- a/src/plugins/brokers/kibbleES.py +++ b/src/plugins/brokers/kibbleES.py @@ -20,6 +20,7 @@ import elasticsearch.helpers import threading import sys +import traceback KIBBLE_DB_VERSION = 2 # Current DB struct version ACCEPTED_DB_VERSIONS = [1,2] # Versions we know how to work with. 
@@ -33,7 +34,7 @@ class _KibbleESWrapper(object): def __init__(self, ES): self.ES = ES self.indices = self.indicesClass(ES) - + def get(self, index, doc_type, id): return self.ES.get(index = index+'_'+doc_type, doc_type = '_doc', id = id) def exists(self, index, doc_type, id): @@ -57,12 +58,12 @@ def count(self, index, doc_type, body = None): doc_type = '_doc', body = body ) - + class indicesClass(object): """ Indices helper class """ def __init__(self, ES): self.ES = ES - + def exists(self, index): return self.ES.indices.exists(index = index) @@ -76,9 +77,9 @@ def __init__(self, ES, auth): self.ES = ES.options(basic_auth=auth) else: self.ES = ES - + self.indices = self.indicesClass(ES) - + def get(self, index, doc_type, id): return self.ES.get(index = index+'_'+doc_type, id = id) def exists(self, index, doc_type, id): @@ -100,15 +101,15 @@ def count(self, index, doc_type, body = None): index = index+'_'+doc_type, body = body ) - + class indicesClass(object): """ Indices helper class """ def __init__(self, ES): self.ES = ES - + def exists(self, index): return self.ES.indices.exists(index = index) - + # This is redundant, refactor later? 
@@ -121,7 +122,7 @@ def pprint(string, err = False): class KibbleBit: """ KibbleBit class with direct ElasticSearch access """ - + def __init__(self, broker, organisation, tid): self.config = broker.config self.organisation = organisation @@ -131,20 +132,20 @@ def __init__(self, broker, organisation, tid): self.pluginname = "" self.tid = tid self.dbname = self.broker.config['elasticsearch']['database'] - + def __del__(self): """ On unload/delete, push the last chunks of data to ES """ if self.json_queue: print("Pushing stragglers") self.bulk() - + def pprint(self, string, err = False): line = "[thread#%i:%s]: %s" % (self.tid, self.pluginname, string) if err: sys.stderr.write(line + "\n") else: print(line) - + def updateSource(self, source): """ Updates a source document, usually with a status update """ self.broker.DB.index(index=self.broker.config['elasticsearch']['database'], @@ -152,23 +153,23 @@ def updateSource(self, source): id=source['sourceID'], body = source ) - + def get(self, doctype, docid): """ Fetches a document from the DB """ doc = self.broker.DB.get(index=self.broker.config['elasticsearch']['database'], doc_type=doctype, id = docid) if doc: return doc['_source'] return None - + def exists(self, doctype, docid): """ Checks whether a document already exists or not """ return self.broker.DB.exists(index=self.broker.config['elasticsearch']['database'], doc_type=doctype, id = docid) - + def index(self, doctype, docid, document): """ Adds a new document to the index """ dbname = self.broker.config['elasticsearch']['database'] - self.broker.DB.index(index=dbname, doc_type = doctype, id = docid, body = document) - + self.broker.DB.index(index=dbname, doc_type = doctype, id = docid, body = document) + def append(self, t, doc): """ Append a document to the bulk push queue """ if not 'id' in doc: @@ -180,7 +181,7 @@ def append(self, t, doc): if len(self.json_queue) > self.queueMax: pprint("Bulk push forced") self.bulk() - + def bulk(self): """ Push pending 
JSON objects in the queue to ES""" xjson = self.json_queue @@ -193,6 +194,7 @@ def bulk(self): dbname = self.broker.config['elasticsearch']['database'] if self.broker.noTypes: dbname += "_%s" % js['doctype'] + #del doc['doctype'] defaultJSON = { '_op_type': 'update' if js.get('upsert') else 'index', '_index': dbname, @@ -213,28 +215,30 @@ def bulk(self): 'doc_as_upsert': True, }) try: - elasticsearch.helpers.bulk(self.broker.oDB, js_arr) + #print("Bulk insert JSON %s." % js_arr) + res = elasticsearch.helpers.bulk(self.broker.oDB, js_arr) + print("Result bulk: ", res) except Exception as err: pprint("Warning: Could not bulk insert: %s" % err) self.traceBack() - + def traceBack(self): err_type, err_value, tb = sys.exc_info() traceback_output = ['API traceback:'] traceback_output += traceback.format_tb(tb) traceback_output.append('%s: %s' % (err_type.__name__, err_value)) - pprint("Error: traceback_output: %s" % (traceback_output)) + print("Traceback: ", traceback_output ) return traceback_output - + class KibbleOrganisation: """ KibbleOrg with direct ElasticSearch access """ def __init__(self, broker, org): """ Init an org, set up ElasticSearch for KibbleBits later on """ - + self.broker = broker self.id = org - + def sources(self, sourceType = None, view = None): """ Get all sources or sources of a specific type for an org """ s = [] @@ -280,7 +284,7 @@ def sources(self, sourceType = None, view = None): } } ) - + for hit in res['hits']['hits']: if sourceType == None or hit['_source']['type'] == sourceType: s.append(hit['_source']) @@ -294,7 +298,7 @@ def __init__(self, config): if 'user' in es_config: auth = (es_config['user'], es_config['password']) pprint("Connecting to ElasticSearch database at %s:%i..." 
% (es_config['hostname'], es_config.get('port', 9200))) - + defaultELConfig = { 'host': es_config['hostname'], 'port': int(es_config.get('port', 9200)) @@ -309,7 +313,7 @@ def __init__(self, config): defaultELConfig['verify_certs'] = False defaultELConfig['url_prefix'] = es_config.get('uri', '') defaultELConfig['http_auth'] = auth - + es = elasticsearch.Elasticsearch([ defaultELConfig ], max_retries=5, retry_on_timeout=True @@ -352,11 +356,11 @@ def __init__(self, config): if apidoc['dbversion'] < KIBBLE_DB_VERSION: sys.stderr.write("The database '%s' uses an older structure format (version %u) than the scanners (version %u). Please upgrade your main Kibble server.\n" % (es_config['database'], apidoc['dbversion'], KIBBLE_DB_VERSION)) sys.exit(-1) - + def organisations(self): """ Return a list of all organisations """ orgs = [] - + # Run the search, fetch all orgs, 9999 max. TODO: Scroll??? res = self.DB.search( index=self.config['elasticsearch']['database'], @@ -368,10 +372,8 @@ def organisations(self): } } ) - + for hit in res['hits']['hits']: org = hit['_source']['id'] orgClass = KibbleOrganisation(self, org) yield orgClass - - diff --git a/src/plugins/scanners/github-issues.py b/src/plugins/scanners/github-issues.py index ebab0eb..bea1180 100644 --- a/src/plugins/scanners/github-issues.py +++ b/src/plugins/scanners/github-issues.py @@ -59,7 +59,7 @@ def make_issue(source, issue, people): owner_email = people[issue['user']['login']]['email'] issue_closer = owner_email - if 'closed_by' in issue: + if 'closed_by' in issue and issue['closed_by'] is not None: issue_closer = people[issue['closed_by']['login']] # Is this an issue ro a pull request? 
itype = "issue" @@ -116,7 +116,7 @@ def update_issue(KibbleBit, issue): def update_person(KibbleBit, person): person['upsert'] = True KibbleBit.append('person', person) - + def scan(KibbleBit, source, firstAttempt = True): auth=None @@ -160,7 +160,8 @@ def scan(KibbleBit, source, firstAttempt = True): people[issue['user']['login']] = person update_person(KibbleBit, person) - if 'closed_by' in issue and not issue['closed_by']['login'] in people: + #KibbleBit.pprint("issue: %s" % issue ) + if 'closed_by' in issue and issue['closed_by'] is not None and not issue['closed_by']['login'] in people: closer = make_person(source, issue, plugins.utils.github.user(issue['closed_by']['url'], auth=auth)) people[issue['closed_by']['login']] = closer @@ -176,7 +177,7 @@ def scan(KibbleBit, source, firstAttempt = True): continue update_issue(KibbleBit, doc) - + source['steps']['issues'] = { 'time': time.time(), 'status': 'Issue scan completed at ' + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()), @@ -201,8 +202,8 @@ def scan(KibbleBit, source, firstAttempt = True): if plugins.utils.github.get_tokens_left(auth=auth) > 10: scan(KibbleBit, source, False) # If this one fails, bail completely return - - + + KibbleBit.pprint("HTTP Error, rate limit exceeded?") source['steps']['issues'] = { 'time': time.time(), From b8ef9a3f7ef63c8d6f1491f8d6b61f24cade6361 Mon Sep 17 00:00:00 2001 From: Georg Kallidis Date: Wed, 27 Nov 2024 13:21:13 +0100 Subject: [PATCH 7/9] Add pre-commit configuration and docs; more accurate prints in bulk method, replace deprecated utcnow, use importlib for plugins.utils imports; fix in jira module non existing url variable to param source, fix in urlsmisc import base64 and remove unassigned variable te; remove unused imports in utils.git module; Update README.md In scanners jira,git-census,ponymail: Fix missing steps in source (might be true for other scanners, and root cause currently unknown). Fix exception in creds checking in jira. 
If es_doc variable has a key doc use this key in git-issues.py. --- .pre-commit-config.yaml | 31 ++++++++++ README.md | 35 ++++++++++- requirements.txt | 1 + src/plugins/brokers/kibbleES.py | 4 +- src/plugins/scanners/bugzilla.py | 55 +++++++++-------- src/plugins/scanners/buildbot.py | 65 ++++++++++---------- src/plugins/scanners/discourse.py | 87 ++++++++++++++------------- src/plugins/scanners/gerrit.py | 5 +- src/plugins/scanners/git-census.py | 17 +++--- src/plugins/scanners/git-evolution.py | 40 ++++++------ src/plugins/scanners/git-sloc.py | 27 +++++---- src/plugins/scanners/git-sync.py | 29 +++++---- src/plugins/scanners/github-issues.py | 26 ++++---- src/plugins/scanners/github-stats.py | 31 +++++----- src/plugins/scanners/jira.py | 64 +++++++++++--------- src/plugins/scanners/ponymail.py | 41 +++++++------ src/plugins/utils/git.py | 4 +- src/plugins/utils/urlmisc.py | 6 +- 18 files changed, 325 insertions(+), 243 deletions(-) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..88a4998 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +default_stages: [pre-commit, pre-push] +default_language_version: + # force all unspecified python hooks to run python3 + python: python3 +minimum_pre_commit_version: "3.4.0" + +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: check-yaml + - id: end-of-file-fixer + - id: trailing-whitespace diff --git a/README.md b/README.md index f2c99cc..8e85410 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ The Kibble Scanners collect information for the Kibble Suite. usage: kibble-scanner.py [-h] [-o ORG] [-f CONFIG] [-a AGE] [-s SOURCE] [-n NODES] [-t TYPE] [-e EXCLUDE [EXCLUDE ...]] [-v VIEW] - + optional arguments: -h, --help show this help message and exit -o ORG, --org ORG The organisation to gather stats for. If left out, all @@ -75,7 +75,36 @@ The Kibble Scanners collect information for the Kibble Suite. - python3-certifi - python3-yaml - + ## Build environment + +### Pre-commit + + After running + + pip install -r requirements.txt + + Run + + pre-commit install + + to install + + Run it explicitely by + + pre-commit + + to run the checks in .pre-commit-config.yaml + + If installed the pre-commit reads the configuration, and will check on the hooks, currently pre-comit and pre-push. + While the checks are not satisfied, just rerun the commit command until the hook checks are passed. + +### Testing + + TBD + +### Project build + + TBD + # Get involved TBD. Please see https://kibble.apache.org/ for details! - diff --git a/requirements.txt b/requirements.txt index 7db5a42..668c04d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ psutil python-dateutil requests pyyaml +pre-commit diff --git a/src/plugins/brokers/kibbleES.py b/src/plugins/brokers/kibbleES.py index 28364b6..9c1d14a 100644 --- a/src/plugins/brokers/kibbleES.py +++ b/src/plugins/brokers/kibbleES.py @@ -215,10 +215,10 @@ def bulk(self): 'doc_as_upsert': True, }) try: - #print("Bulk insert JSON %s." 
% js_arr) res = elasticsearch.helpers.bulk(self.broker.oDB, js_arr) - print("Result bulk: ", res) + print("Result (success,failed): ", res) except Exception as err: + print("Error for INPUT JSON %s." % js_arr) pprint("Warning: Could not bulk insert: %s" % err) self.traceBack() diff --git a/src/plugins/scanners/bugzilla.py b/src/plugins/scanners/bugzilla.py index 4d9ca37..447af26 100644 --- a/src/plugins/scanners/bugzilla.py +++ b/src/plugins/scanners/bugzilla.py @@ -120,10 +120,10 @@ def scanTicket(bug, KibbleBit, source, openTickets, u, dom): else: pass #print("Ticket hasn't changed, ignoring...") - + if parseIt: KibbleBit.pprint("Parsing data from BugZilla for #%s" % key) - + params = { 'ids': [int(key)], 'limit': 0 @@ -132,7 +132,7 @@ def scanTicket(bug, KibbleBit, source, openTickets, u, dom): params['Bugzilla_login'] = source['creds']['username'] params['Bugzilla_password'] = source['creds']['password'] ticketsURL = "%s?method=Bug.get¶ms=[%s]" % (u, urllib.parse.quote(json.dumps(params))) - + js = plugins.utils.jsonapi.get(ticketsURL) js= js['result']['bugs'][0] creator = { @@ -162,17 +162,17 @@ def scanTicket(bug, KibbleBit, source, openTickets, u, dom): ticketsURL = "%s?method=Bug.comments¶ms=[%s]" % (u, urllib.parse.quote(json.dumps(params))) hjs = plugins.utils.jsonapi.get(ticketsURL) comments = len(hjs['result']['bugs'][str(key)]['comments']) - + title = bug['summary'] del params['ids'] if closer: - + pid = hashlib.sha1( ("%s%s" % (source['organisation'], closer['email'])).encode('ascii', errors='replace')).hexdigest() found = KibbleBit.exists('person', pid) if not found: params['names'] = [closer['email']] ticketsURL = "%s?method=User.get¶ms=[%s]" % (u, urllib.parse.quote(json.dumps(params))) - + try: ujs = plugins.utils.jsonapi.get(ticketsURL) displayName = ujs['result']['users'][0]['real_name'] @@ -180,7 +180,7 @@ def scanTicket(bug, KibbleBit, source, openTickets, u, dom): displayName = closer['email'] if displayName and len(displayName) > 0: # Add to 
people db - + jsp = { 'name': displayName, 'email': closer['email'], @@ -189,7 +189,7 @@ def scanTicket(bug, KibbleBit, source, openTickets, u, dom): } #print("Updating person DB for closer: %s (%s)" % (displayName, closerEmail)) KibbleBit.index('person', pid, jsp) - + if creator: pid = hashlib.sha1( ("%s%s" % (source['organisation'], creator['email'])).encode('ascii', errors='replace')).hexdigest() found = KibbleBit.exists('person', pid) @@ -204,7 +204,7 @@ def scanTicket(bug, KibbleBit, source, openTickets, u, dom): creator['name'] = creator['email'] if creator['name'] and len(creator['name']) > 0: # Add to people db - + jsp = { 'name': creator['name'], 'email': creator['email'], @@ -212,7 +212,7 @@ def scanTicket(bug, KibbleBit, source, openTickets, u, dom): 'id' :pid } KibbleBit.index('person', pid, jsp) - + jso = { 'id': dhash, 'key': key, @@ -223,7 +223,7 @@ def scanTicket(bug, KibbleBit, source, openTickets, u, dom): 'created': cd, 'closed': rd, 'issuetype': 'issue', - 'issueCloser': closer['email'] if 'email' in closer else None, + 'issueCloser': closer['email'] if 'email' in closer else None, 'createdDate': time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(cd)), 'closedDate': time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(rd)) if rd else None, 'changeDate': time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(rd if rd else cd)), @@ -238,8 +238,8 @@ def scanTicket(bug, KibbleBit, source, openTickets, u, dom): except Exception as err: KibbleBit.pprint(err) return False - - + + class bzThread(Thread): @@ -252,10 +252,10 @@ def __init__(self, KibbleBit, source, block, pt, ot, u, dom): self.openTickets = ot self.u = u self.dom = dom - + def run(self): badOnes = 0 - + while len(self.pendingTickets) > 0 and badOnes <= 50: if len(self.pendingTickets) % 10 == 0: self.KibbleBit.pprint("%u elements left to count" % len(self.pendingTickets)) @@ -284,13 +284,16 @@ def run(self): return else: badOnes = 0 - + def scan(KibbleBit, source): path = source['sourceID'] url = 
source['sourceURL'] - + + if not 'steps' in source: + source['steps'] = {} + source['steps']['issues'] = { 'time': time.time(), 'status': 'Parsing BugZilla changes...', @@ -298,7 +301,7 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - + bz = re.match(r"(https?://\S+?)(/jsonrpc\.cgi)?[\s:?]+(.+)", url) if bz: if source['creds'] and 'username' in source['creds'] and source['creds']['username'] and len(source['creds']['username']) > 0: @@ -313,10 +316,10 @@ def scan(KibbleBit, source): u = "%s/jsonrpc.cgi" % dom instance = bz.group(3) lastTicket = 0 - + params = { 'product': [instance], - 'status': ["RESOLVED", "CLOSED", "NEW","UNCOMFIRMED","ASSIGNED","REOPENED","VERIFIED"], + 'status': ["RESOLVED", "CLOSED", "NEW","UNCOMFIRMED","ASSIGNED","REOPENED","VERIFIED"], 'include_fields': ['id', 'creation_time', 'status', 'summary', 'creator'], 'limit': 10000, 'offset': 1 @@ -324,12 +327,12 @@ def scan(KibbleBit, source): # If * is requested, just omit the product name if instance == '*': params = { - 'status': ["RESOLVED", "CLOSED", "NEW","UNCOMFIRMED","ASSIGNED","REOPENED","VERIFIED"], + 'status': ["RESOLVED", "CLOSED", "NEW","UNCOMFIRMED","ASSIGNED","REOPENED","VERIFIED"], 'include_fields': ['id', 'creation_time', 'status', 'summary', 'creator'], 'limit': 10000, 'offset': 1 } - + ticketsURL = "%s?method=Bug.search¶ms=[%s]" % (u, urllib.parse.quote(json.dumps(params))) while True: @@ -338,7 +341,7 @@ def scan(KibbleBit, source): except: KibbleBit.pprint("Couldn't fetch more tickets, bailing") break - + if len(js['result']['bugs']) > 0: KibbleBit.pprint("%s: Found %u tickets..." % (source['sourceURL'], ((params.get('offset', 1)-1) + len(js['result']['bugs'])))) for bug in js['result']['bugs']: @@ -350,7 +353,7 @@ def scan(KibbleBit, source): else: KibbleBit.pprint("No more tickets left to scan") break - + KibbleBit.pprint("Found %u open tickets, %u closed." 
% (len(openTickets), len(pendingTickets) - len(openTickets))) badOnes = 0 @@ -360,10 +363,10 @@ def scan(KibbleBit, source): t = bzThread(KibbleBit, source, block, pendingTickets, openTickets, u, dom) threads.append(t) t.start() - + for t in threads: t.join() - + source['steps']['issues'] = { 'time': time.time(), diff --git a/src/plugins/scanners/buildbot.py b/src/plugins/scanners/buildbot.py index b99f5bf..566e86c 100644 --- a/src/plugins/scanners/buildbot.py +++ b/src/plugins/scanners/buildbot.py @@ -30,7 +30,7 @@ """ title = "Scanner for Buildbot" -version = "0.1.0" +version = "0.1.1" def accepts(source): """ Determines whether we want to handle this source """ @@ -41,17 +41,17 @@ def accepts(source): def scanJob(KibbleBit, source, job, creds): """ Scans a single job for activity """ - NOW = int(datetime.datetime.utcnow().timestamp()) + NOW = int(datetime.datetime.now(datetime.timezone.utc).timestamp()) dhash = hashlib.sha224( ("%s-%s-%s" % (source['organisation'], source['sourceID'], job) ).encode('ascii', errors='replace')).hexdigest() found = True doc= None parseIt = False found = KibbleBit.exists('cijob', dhash) - + jobURL = "%s/api/v2/builders/%s/builds" % (source['sourceURL'], job) KibbleBit.pprint(jobURL) jobjson = plugins.utils.jsonapi.get(jobURL, auth = creds) - + # If valid JSON, ... if jobjson: for buildno, data in jobjson.items(): @@ -61,16 +61,16 @@ def scanJob(KibbleBit, source, job, creds): builddoc = KibbleBit.get('ci_build', buildhash) except: pass - + # If this build already completed, no need to parse it again if builddoc and builddoc.get('completed', False): continue - + KibbleBit.pprint("[%s-%s] This is new or pending, analyzing..." 
% (job, buildno)) - + completed = True if 'currentStep' in data else False - - + + # Get build status (success, failed, canceled etc) status = 'building' if 'successful' in data.get('text', []): @@ -79,7 +79,7 @@ def scanJob(KibbleBit, source, job, creds): status = 'failed' if 'exception' in data.get('text', []): status = 'aborted' - + DUR = 0 # Calc when the build finished if completed and len(data.get('times', [])) == 2 and data['times'][1]: @@ -87,7 +87,7 @@ def scanJob(KibbleBit, source, job, creds): DUR = FIN - data['times'][0] else: FIN = 0 - + doc = { # Build specific data 'id': buildhash, @@ -100,7 +100,7 @@ def scanJob(KibbleBit, source, job, creds): 'status': status, 'started': int(data['times'][0]), 'ci': 'buildbot', - + # Standard docs values 'sourceID': source['sourceID'], 'organisation': source['organisation'], @@ -109,7 +109,7 @@ def scanJob(KibbleBit, source, job, creds): KibbleBit.append('ci_build', doc) # Yay, it worked! return True - + # Boo, it failed! KibbleBit.pprint("Fetching job data failed!") return False @@ -124,7 +124,7 @@ def __init__(self, block, KibbleBit, source, creds, jobs): self.creds = creds self.source = source self.jobs = jobs - + def run(self): badOnes = 0 while len(self.jobs) > 0 and badOnes <= 50: @@ -158,7 +158,9 @@ def scan(KibbleBit, source): # Simple URL check buildbot = re.match(r"(https?://.+)", source['sourceURL']) if buildbot: - + if not 'steps' in source: + source['steps'] = {} + source['steps']['ci'] = { 'time': time.time(), 'status': 'Parsing Buildbot job changes...', @@ -166,7 +168,7 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - + badOnes = 0 pendingJobs = [] KibbleBit.pprint("Parsing Buildbot activity at %s" % source['sourceURL']) @@ -177,22 +179,22 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - - # Buildbot may neeed credentials + + # Buildbot may need credentials creds = None if source['creds'] and 'username' in source['creds'] and 
source['creds']['username'] and len(source['creds']['username']) > 0: creds = "%s:%s" % (source['creds']['username'], source['creds']['password']) - + # Get the job list sURL = source['sourceURL'] KibbleBit.pprint("Getting job list...") builders = plugins.utils.jsonapi.get("%s/api/v2/builders" % sURL , auth = creds) - + # Save queue snapshot - NOW = int(datetime.datetime.utcnow().timestamp()) + NOW = int(datetime.datetime.now(datetime.timezone.utc).timestamp()) queuehash = hashlib.sha224( ("%s-%s-queue-%s" % (source['organisation'], source['sourceID'], int(time.time())) ).encode('ascii', errors='replace')).hexdigest() - - + + # Scan queue items blocked = 0 stuck = 0 @@ -202,7 +204,7 @@ def scan(KibbleBit, source): actualQueueSize = 0 building = 0 jobs = [] - + for builder, data in builders.items(): jobs.append(builder) if data['state'] == 'building': @@ -217,8 +219,8 @@ def scan(KibbleBit, source): # Stuck builds (iow no builder available) if data['state'] == 'offline': stuck += data.get('pendingBuilds', 0) - - + + # Write up a queue doc queuedoc = { 'id': queuehash, @@ -229,16 +231,16 @@ def scan(KibbleBit, source): 'stuck': stuck, 'building': building, 'ci': 'buildbot', - + # Standard docs values 'sourceID': source['sourceID'], 'organisation': source['organisation'], 'upsert': True, } KibbleBit.append('ci_queue', queuedoc) - + KibbleBit.pprint("Found %u builders in Buildbot" % len(jobs)) - + threads = [] block = threading.Lock() KibbleBit.pprint("Scanning jobs using 4 sub-threads") @@ -246,11 +248,11 @@ def scan(KibbleBit, source): t = buildbotThread(block, KibbleBit, source, creds, jobs) threads.append(t) t.start() - + for t in threads: t.join() - # We're all done, yaay + # We're all done, yaay KibbleBit.pprint("Done scanning %s" % source['sourceURL']) source['steps']['ci'] = { @@ -260,4 +262,3 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - \ No newline at end of file diff --git a/src/plugins/scanners/discourse.py 
b/src/plugins/scanners/discourse.py index d160baa..9ecae03 100644 --- a/src/plugins/scanners/discourse.py +++ b/src/plugins/scanners/discourse.py @@ -30,7 +30,7 @@ """ title = "Scanner for Discourse Forums" -version = "0.1.0" +version = "0.1.1" def accepts(source): """ Determines whether we want to handle this source """ @@ -41,30 +41,30 @@ def accepts(source): def scanJob(KibbleBit, source, cat, creds): """ Scans a single discourse category for activity """ - NOW = int(datetime.datetime.utcnow().timestamp()) - + NOW = int(datetime.datetime.now(datetime.timezone.utc).timestamp()) + # Get $discourseURL/c/$catID - + catURL = os.path.join(source['sourceURL'], "c/%s" % cat['id']) KibbleBit.pprint("Scanning Discourse category '%s' at %s" % (cat['slug'], catURL)) - + page = 0 allUsers = {} - + # For each paginated result (up to page 100), check for changes while page < 100: pcatURL = "%s?page=%u" % (catURL, page) catjson = plugins.utils.jsonapi.get(pcatURL, auth = creds) page += 1 - - + + if catjson: - + # If we hit an empty list (no more topics), just break the loop. if not catjson['topic_list']['topics']: break - - # First (if we have data), we should store the known users + + # First (if we have data), we should store the known users # Since discourse hides the email (obviously!), we'll have to # fake one to generate an account. fakeDomain = "foo.discourse" @@ -75,7 +75,7 @@ def scanJob(KibbleBit, source, cat, creds): # Fake email address, compute deterministic ID email = "%s@%s" % (user['username'], fakeDomain) dhash = hashlib.sha224( ("%s-%s-%s" % (source['organisation'], source['sourceURL'], email) ).encode('ascii', errors='replace')).hexdigest() - + # Construct a very sparse user document userDoc = { 'id': dhash, @@ -83,29 +83,29 @@ def scanJob(KibbleBit, source, cat, creds): 'name': user['username'], 'email': email, } - + # Store user-ID-to-username mapping for later allUsers[user['id']] = userDoc - + # Store it (or, queue storage) unless it exists. 
# We don't wanna override better data, so we check if # it's there first. if not KibbleBit.exists('person', dhash): KibbleBit.append('person', userDoc) - + # Now, for each topic, we'll store a topic document for topic in catjson['topic_list']['topics']: - + # Calculate topic ID dhash = hashlib.sha224( ("%s-%s-topic-%s" % (source['organisation'], source['sourceURL'], topic['id']) ).encode('ascii', errors='replace')).hexdigest() - + # Figure out when topic was created and updated CreatedDate = datetime.datetime.strptime(topic['created_at'], "%Y-%m-%dT%H:%M:%S.%fZ").timestamp() if topic.get('last_posted_at'): UpdatedDate = datetime.datetime.strptime(topic['last_posted_at'], "%Y-%m-%dT%H:%M:%S.%fZ").timestamp() else: UpdatedDate = 0 - + # Determine whether we should scan this topic or continue to the next one. # We'll do this by seeing if the topic already exists and has no changes or not. if KibbleBit.exists('forum_topic', dhash): @@ -113,14 +113,14 @@ def scanJob(KibbleBit, source, cat, creds): # If update in the old doc was >= current update timestamp, skip the topic if fdoc['updated'] >= UpdatedDate: continue - - + + # Assuming we need to scan this, start by making the base topic document topicdoc = { 'id': dhash, 'sourceID': source['sourceID'], 'organisation': source['organisation'], - + 'type': 'discourse', 'category': cat['slug'], 'title': topic['title'], @@ -134,23 +134,23 @@ def scanJob(KibbleBit, source, cat, creds): 'views': topic['views'], 'url': source['sourceURL'] + "/t/%s/%s" % (topic['slug'], topic['id']) } - + KibbleBit.append('forum_topic', topicdoc) KibbleBit.pprint("%s is new or changed, scanning" % topicdoc['url']) - + # Now grab all the individual replies/posts # Remember to not have it count as a visit! 
pURL = "%s?track_visit=false&forceLoad=true" % topicdoc['url'] pjson = plugins.utils.jsonapi.get(pURL, auth = creds) - + posts = pjson['post_stream']['posts'] - + # For each post/reply, construct a forum_entry document KibbleBit.pprint("%s has %u posts" % (pURL, len(posts))) for post in posts: phash = hashlib.sha224( ("%s-%s-post-%s" % (source['organisation'], source['sourceURL'], post['id']) ).encode('ascii', errors='replace')).hexdigest() uname = post.get('name', post['username']) or post['username'] # Hack to get longest non-zero value - + # Find the hash of the person who posted it # We may know them, or we may have to store them. # If we have better info now (full name), re-store @@ -160,7 +160,7 @@ def scanJob(KibbleBit, source, cat, creds): # Same as before, fake email, store... email = "%s@%s" % (post['username'], fakeDomain) uhash = hashlib.sha224( ("%s-%s-%s" % (source['organisation'], source['sourceURL'], email) ).encode('ascii', errors='replace')).hexdigest() - + # Construct a very sparse user document userDoc = { 'id': uhash, @@ -168,22 +168,22 @@ def scanJob(KibbleBit, source, cat, creds): 'name': uname, 'email': email, } - + # Store user-ID-to-username mapping for later allUsers[user['id']] = userDoc - + # Store it (or, queue storage) KibbleBit.append('person', userDoc) - + # Get post date CreatedDate = datetime.datetime.strptime(post['created_at'], "%Y-%m-%dT%H:%M:%S.%fZ").timestamp() - + # Store the post/reply document pdoc = { 'id': phash, 'sourceID': source['sourceID'], 'organisation': source['organisation'], - + 'type': 'discourse', 'creator': uhash, 'created': CreatedDate, @@ -209,7 +209,7 @@ def __init__(self, block, KibbleBit, source, creds, jobs): self.creds = creds self.source = source self.jobs = jobs - + def run(self): badOnes = 0 while len(self.jobs) > 0 and badOnes <= 50: @@ -243,7 +243,9 @@ def scan(KibbleBit, source): # Simple URL check discourse = re.match(r"(https?://.+)", source['sourceURL']) if discourse: - + if not 'steps' in 
source: + source['steps'] = {} + source['steps']['forum'] = { 'time': time.time(), 'status': 'Parsing Discourse topics...', @@ -251,7 +253,7 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - + badOnes = 0 pendingJobs = [] KibbleBit.pprint("Parsing Discourse activity at %s" % source['sourceURL']) @@ -262,22 +264,22 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - + # Discourse may neeed credentials (if basic auth) creds = None if source['creds'] and 'username' in source['creds'] and source['creds']['username'] and len(source['creds']['username']) > 0: creds = "%s:%s" % (source['creds']['username'], source['creds']['password']) - + # Get the list of categories sURL = source['sourceURL'] KibbleBit.pprint("Getting categories...") catjs = plugins.utils.jsonapi.get("%s/categories_and_latest" % sURL , auth = creds) - + # Directly assign the category list as pending jobs queue, ezpz. pendingJobs = catjs['category_list']['categories'] - + KibbleBit.pprint("Found %u categories" % len(pendingJobs)) - + # Now fire off 4 threads to parse the categories threads = [] block = threading.Lock() @@ -286,11 +288,11 @@ def scan(KibbleBit, source): t = discourseThread(block, KibbleBit, source, creds, pendingJobs) threads.append(t) t.start() - + for t in threads: t.join() - # We're all done, yaay + # We're all done, yaay KibbleBit.pprint("Done scanning %s" % source['sourceURL']) source['steps']['forum'] = { @@ -300,4 +302,3 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - \ No newline at end of file diff --git a/src/plugins/scanners/gerrit.py b/src/plugins/scanners/gerrit.py index 86dd327..70148a1 100644 --- a/src/plugins/scanners/gerrit.py +++ b/src/plugins/scanners/gerrit.py @@ -154,6 +154,9 @@ def status_changed(stored_change, change): return stored_change['status'] != change['status'] def scan(KibbleBit, source): + if not 'steps' in source: + source['steps'] = {} + source['steps']['issues'] = 
{ 'time': time.time(), 'status': 'Analyzing Gerrit tickets...', @@ -226,7 +229,7 @@ def scan(KibbleBit, source): except requests.HTTPError as e: print(e) - + source['steps']['issues'] = { 'time': time.time(), 'status': 'Done analyzing tickets!', diff --git a/src/plugins/scanners/git-census.py b/src/plugins/scanners/git-census.py index f5cc69d..b0327fc 100644 --- a/src/plugins/scanners/git-census.py +++ b/src/plugins/scanners/git-census.py @@ -50,8 +50,11 @@ def scan(KibbleBit, source): url = source['sourceURL'] rootpath = "%s/%s/git" % (KibbleBit.config['scanner']['scratchdir'], source['organisation']) gpath = os.path.join(rootpath, rid) - - if 'steps' in source and source['steps']['sync']['good'] and os.path.exists(gpath): + + if not 'steps' in source: + source['steps'] = {} + + if source['steps']['sync']['good'] and os.path.exists(gpath): source['steps']['census'] = { 'time': time.time(), 'status': 'Census count started at ' + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()), @@ -177,7 +180,7 @@ def scan(KibbleBit, source): # Make a list of changed files, max 1024 filelist = list(files_touched) filelist = filelist[:1023] - + # ES commit documents tsd = ts - (ts % 86400) js = { @@ -222,7 +225,7 @@ def scan(KibbleBit, source): 'organisation': source['organisation'], 'id' : hashlib.sha1( ("%s%s" % (source['organisation'], ce)).encode('ascii', errors='replace')).hexdigest() }) - KibbleBit.append ( 'person', + KibbleBit.append ( 'person', { 'upsert': True, 'name': an, @@ -234,7 +237,7 @@ def scan(KibbleBit, source): ) KibbleBit.append('code_commit', js) KibbleBit.append('code_commit_unique', jsx) - + if True: # Do file changes?? 
Might wanna make this optional KibbleBit.pprint("Scanning file changes for %s" % source['sourceURL']) for filename in modificationDates: @@ -257,7 +260,7 @@ def scan(KibbleBit, source): del jsfe['created'] del jsfe['createdDate'] KibbleBit.append('file_history', jsfe) - + source['steps']['census'] = { 'time': time.time(), 'status': 'Census count completed at ' + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()), @@ -266,5 +269,3 @@ def scan(KibbleBit, source): } source['census'] = time.time() KibbleBit.updateSource(source) - - diff --git a/src/plugins/scanners/git-evolution.py b/src/plugins/scanners/git-evolution.py index 8ed648c..b9b6ca5 100644 --- a/src/plugins/scanners/git-evolution.py +++ b/src/plugins/scanners/git-evolution.py @@ -16,14 +16,13 @@ # limitations under the License. """ Git Evolution scanner """ +import importlib import os import subprocess -import re import time import calendar import datetime -import plugins.utils.git -import plugins.utils.sloc + import hashlib from collections import namedtuple @@ -68,7 +67,7 @@ def release(KibbleBit, source, status, exception=None, good=False): if exception: source['steps']['evolution'].update({'exception': exception}) KibbleBit.updateSource(source) - + def check_branch(gpath, date, branch): try: @@ -114,48 +113,48 @@ def find_branch(date, gpath): def scan(KibbleBit, source): - + rid = source['sourceID'] url = source['sourceURL'] rootpath = "%s/%s/git" % (KibbleBit.config['scanner']['scratchdir'], source['organisation']) gpath = os.path.join(rootpath, rid) - + gname = source['sourceID'] KibbleBit.pprint("Doing evolution scan of %s" % gname) - + inp = get_first_ref(gpath) if inp: ts = int(inp.split()[0]) ts = ts - (ts % 86400) date = time.strftime("%Y-%b-%d 0:00", time.gmtime(ts)) - + #print("Starting from %s" % date) now = time.time() - + rid = source['sourceID'] url = source['sourceURL'] rootpath = "%s/%s/git" % (KibbleBit.config['scanner']['scratchdir'], source['organisation']) gpath = 
os.path.join(rootpath, rid) - + if source['steps']['sync']['good'] and os.path.exists(gpath): acquire(KibbleBit, source) branch = find_branch(date, gpath) - + if not branch: release(source, "Could not do evolutionary scan of code", "No default branch was found in this repository") return - + branch_exists = check_branch(gpath, date, branch) - + if not branch_exists: KibbleBit.pprint("Not trunk either (bad repo?), skipping") release(source, "Could not do evolutionary scan of code", "No default branch was found in this repository") return - + try: - + d = time.gmtime(now) year = d[0] quarter = d[1] - (d[1] % 3) @@ -166,7 +165,7 @@ def scan(KibbleBit, source): pd = datetime.datetime(year, quarter, 1).replace(tzinfo=datetime.timezone.utc).timetuple() date = time.strftime("%Y-%b-%d 0:00", pd) unix = calendar.timegm(pd) - + # Skip the dates we've already processed dhash = hashlib.sha224((source['sourceID'] + date).encode('ascii', 'replace')).hexdigest() @@ -174,7 +173,8 @@ def scan(KibbleBit, source): if not found: checkout(gpath, date, branch) KibbleBit.pprint("Running cloc on %s (%s) at %s" % (gname, source['sourceURL'], date)) - languages, codecount, comment, blank, years, cost = plugins.utils.sloc.count(gpath) + sloc = importlib.import_module("plugins.utils.sloc") + languages, codecount, comment, blank, years, cost = sloc.count(gpath) js = { 'time': unix, 'sourceID': source['sourceID'], @@ -192,7 +192,7 @@ def scan(KibbleBit, source): if quarter <= 0: quarter += 12 year -= 1 - + # decrease month by 3 now = time.mktime(datetime.date(year, quarter, 1).timetuple()) except Exception as e: @@ -201,9 +201,7 @@ def scan(KibbleBit, source): time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()), str(e)) return - + release(KibbleBit, source, "Evolution scan completed at " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()), good=True) - - \ No newline at end of file diff --git a/src/plugins/scanners/git-sloc.py b/src/plugins/scanners/git-sloc.py index 
e2294f7..0b202ce 100644 --- a/src/plugins/scanners/git-sloc.py +++ b/src/plugins/scanners/git-sloc.py @@ -16,19 +16,17 @@ # limitations under the License. """ Source Lines of Code counter for Git """ - - +import importlib import os import sys import subprocess import time import shutil -import plugins.utils.git -import plugins.utils.sloc + import re title = "SloC Counter for Git" -version = "0.1.0" +version = "0.1.1" def accepts(source): """ Do we accept this source? """ @@ -40,12 +38,15 @@ def accepts(source): return False def scan(KibbleBit, source): - + rid = source['sourceID'] url = source['sourceURL'] rootpath = "%s/%s/git" % (KibbleBit.config['scanner']['scratchdir'], source['organisation']) gpath = os.path.join(rootpath, rid) - + + if not 'steps' in source: + source['steps'] = {} + if source['steps']['sync']['good'] and os.path.exists(gpath): source['steps']['count'] = { 'time': time.time(), @@ -54,17 +55,19 @@ def scan(KibbleBit, source): 'good': True, } KibbleBit.updateSource(source) - + + git = importlib.import_module("plugins.utils.git") try: - branch = plugins.utils.git.defaultBranch(source, gpath) + branch = git.defaultBranch(source, gpath) subprocess.call('cd %s && git checkout %s' % (gpath, branch), shell = True) except: KibbleBit.pprint("SLoC counter failed to find main branch for %s!!" % url) return False - + KibbleBit.pprint("Running SLoC count for %s" % url) - languages, codecount, comment, blank, years, cost = plugins.utils.sloc.count(gpath) - + sloc = importlib.import_module("plugins.utils.sloc") + languages, codecount, comment, blank, years, cost = sloc.count(gpath) + sloc = { 'sourceID': source['sourceID'], 'loc': codecount, diff --git a/src/plugins/scanners/git-sync.py b/src/plugins/scanners/git-sync.py index 234e33b..64bec50 100644 --- a/src/plugins/scanners/git-sync.py +++ b/src/plugins/scanners/git-sync.py @@ -15,16 +15,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- import os import sys import subprocess import time import shutil -import plugins.utils.git + +import plugins.utils.git as git title = "Sync plugin for Git repositories" -version = "0.1.2" +version = "0.1.3" def accepts(source): """ Do we accept this source? """ @@ -34,15 +34,15 @@ def accepts(source): if source['type'] == 'github' and source.get('issuesonly', False) == False: return True return False - + def scan(KibbleBit, source): - + #KibbleBit.pprint("Scan source: %s." % source) # Get some vars, construct a data path for the repo path = source['sourceID'] url = source['sourceURL'] rootpath = "%s/%s/git" % (KibbleBit.config['scanner']['scratchdir'], source['organisation']) - + # If the root path does not exist, try to make it recursively. if not os.path.exists(rootpath): try: @@ -57,14 +57,14 @@ def scan(KibbleBit, source): } KibbleBit.updateSource(source) return - + # This is were the repo should be cloned datapath = os.path.join(rootpath, path) - + KibbleBit.pprint("Checking out %s as %s" % (url, path)) try: - if 'steps' not in source: # initial fetch of a github repo may miss steps + if 'steps' not in source: # initial fetch of a github repo may miss steps source['steps'] = {} source['steps']['sync'] = { 'time': time.time(), @@ -73,13 +73,13 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - + # If we already checked this out earlier, just sync it. if os.path.exists(datapath): KibbleBit.pprint("Repo %s exists, fetching changes..." % datapath) - + # Do we have a default branch here? 
- branch = plugins.utils.git.defaultBranch(source, datapath, KibbleBit) + branch = git.defaultBranch(source, datapath, KibbleBit) if len(branch) == 0: source['default_branch'] = branch source['steps']['sync'] = { @@ -113,7 +113,7 @@ def scan(KibbleBit, source): fcommit = fcommit.decode('ascii').strip() subprocess.check_call("cd %s && git reset --hard %s" % (datapath, fcommit), shell = True, stderr=subprocess.STDOUT) try: - subprocess.check_call("cd %s && git clean -xfd" % datpath, shell = True, stderr=subprocess.STDOUT) + subprocess.check_call("cd %s && git clean -xfd" % datapath, shell = True, stderr=subprocess.STDOUT) except: pass # This is a new repo, clone it! @@ -133,7 +133,7 @@ def scan(KibbleBit, source): } KibbleBit.updateSource(source) return - + # All good, yay! source['steps']['sync'] = { 'time': time.time(), @@ -142,4 +142,3 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - diff --git a/src/plugins/scanners/github-issues.py b/src/plugins/scanners/github-issues.py index bea1180..f6737c4 100644 --- a/src/plugins/scanners/github-issues.py +++ b/src/plugins/scanners/github-issues.py @@ -14,13 +14,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +import importlib import re import hashlib from dateutil import parser import time import requests -import plugins.utils.github title = "Scanner for GitHub Issues" version = "0.1.0" @@ -121,12 +120,14 @@ def update_person(KibbleBit, person): def scan(KibbleBit, source, firstAttempt = True): auth=None people = {} + github = importlib.import_module("plugins.utils.github") + if 'creds' in source: KibbleBit.pprint("Using auth for repo %s" % source['sourceURL']) creds = source['creds'] if creds and 'username' in creds: auth = (creds['username'], creds['password']) - TL = plugins.utils.github.get_tokens_left(auth=auth) + TL = github.get_tokens_left(auth=auth) KibbleBit.pprint("Scanning for GitHub issues (%u tokens left on GitHub)" % TL) # Have we scanned before? If so, only do a 3 month scan here. doneBefore = False @@ -143,11 +144,11 @@ def scan(KibbleBit, source, firstAttempt = True): if doneBefore: since = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(time.time() - (3*30*86400))) KibbleBit.pprint("Fetching changes since %s" % since) - issues = plugins.utils.github.get_all(source, plugins.utils.github.issues, + issues = github.get_all(source, github.issues, params={'filter': 'all', 'state':'all', 'since': since}, auth=auth) else: - issues = plugins.utils.github.get_all(source, plugins.utils.github.issues, + issues = github.get_all(source, github.issues, params={'filter': 'all', 'state':'all'}, auth=auth) KibbleBit.pprint("Fetched %s issues for %s" %(str(len(issues)), source['sourceURL'])) @@ -155,14 +156,14 @@ def scan(KibbleBit, source, firstAttempt = True): for issue in issues: if not issue['user']['login'] in people: - person = make_person(source, issue, plugins.utils.github.user(issue['user']['url'], + person = make_person(source, issue, github.user(issue['user']['url'], auth=auth)) people[issue['user']['login']] = person update_person(KibbleBit, person) #KibbleBit.pprint("issue: %s" % issue ) if 'closed_by' in issue and issue['closed_by'] is not None and not 
issue['closed_by']['login'] in people: - closer = make_person(source, issue, plugins.utils.github.user(issue['closed_by']['url'], + closer = make_person(source, issue, github.user(issue['closed_by']['url'], auth=auth)) people[issue['closed_by']['login']] = closer update_person(KibbleBit, closer) @@ -172,7 +173,10 @@ def scan(KibbleBit, source, firstAttempt = True): stored_change = None if KibbleBit.exists('issue', dhash): es_doc = KibbleBit.get('issue', dhash) - if not status_changed(es_doc, doc): + if 'doc' in es_doc: + es_doc = es_doc['doc'] + #KibbleBit.pprint("status %s seen %s." % ('status' in es_doc, 'status' in doc)) + if 'status' in es_doc and 'status' in doc and not status_changed(es_doc, doc): #KibbleBit.pprint("change %s seen already and status unchanged. Skipping." % issue['id']) continue @@ -190,16 +194,16 @@ def scan(KibbleBit, source, firstAttempt = True): # If we errored out because of rate limiting, retry later, otherwise bail if firstAttempt: sleeps = 0 - if plugins.utils.github.get_tokens_left(auth=auth) < 10: + if github.get_tokens_left(auth=auth) < 10: KibbleBit.pprint("Hit rate limits, trying to sleep it off!") - while plugins.utils.github.get_tokens_left(auth=auth) < 10: + while github.get_tokens_left(auth=auth) < 10: sleeps += 1 if sleeps > 24: KibbleBit.pprint("Slept for too long without finding a reset rate limit, giving up!") break time.sleep(300) # Sleep 5 min, then check again.. # If we have tokens, try one more time... - if plugins.utils.github.get_tokens_left(auth=auth) > 10: + if github.get_tokens_left(auth=auth) > 10: scan(KibbleBit, source, False) # If this one fails, bail completely return diff --git a/src/plugins/scanners/github-stats.py b/src/plugins/scanners/github-stats.py index 4ac933c..3c8d9f4 100644 --- a/src/plugins/scanners/github-stats.py +++ b/src/plugins/scanners/github-stats.py @@ -14,14 +14,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. - - +import hashlib +import importlib import os +import re import sys import subprocess import time import shutil -import plugins.utils.git + +from src.plugins.brokers.kibbleES import KibbleBit title = "Traffic statistics plugin for GitHub repositories" version = "0.1.0" @@ -31,17 +33,17 @@ def accepts(source): if source['type'] == 'github': return True return False - + def getTime(string): """ Convert GitHub timestamp to epoch """ return time.mktime(time.strptime(re.sub(r"Z", "", str(string)), "%Y-%m-%dT%H:%M:%S")) def scan(KibbletBit, source): - + # Get some vars, construct a data path for the repo path = source['sourceID'] url = source['sourceURL'] - + auth=None people = {} if 'creds' in source: @@ -60,12 +62,14 @@ def scan(KibbletBit, source): 'good': True } KibbletBit.updateSource(source) - + # Get views - views = plugins.utils.github.views(url, auth) + github = importlib.import_module("plugins.utils.github") + views = github.views(url, auth) if 'views' in views: for el in views['views']: ts = getTime(el['timestamp']) + #print("reformatted time:", ts) shash = hashlib.sha224( ("%s-%s-%s-clones" %(source['organisation'], url, el['timestamp'])).encode('ascii', errors = 'replace')).hexdigest() bit = { 'organisation': source['organisation'], @@ -78,9 +82,9 @@ def scan(KibbletBit, source): 'id': shash } KibbleBit.append('ghstats', bit) - + # Get clones - clones = plugins.utils.github.clones(url, auth) + clones = github.clones(url, auth) if 'clones' in clones: for el in clones['clones']: ts = getTime(el['timestamp']) @@ -96,12 +100,12 @@ def scan(KibbletBit, source): 'id': shash } KibbleBit.append('ghstats', bit) - + # Get referrers - refs = plugins.utils.github.referrers(url, auth) + refs = github.referrers(url, auth) if refs: for el in refs: - el['timestamp'] = time.strftime("%Y-%m-%dT%H:%M:%S", time.time()) + el['timestamp'] = time.strftime("%Y-%m-%dT%H:%M:%S", 
time) ts = getTime(el['timestamp']) shash = hashlib.sha224( ("%s-%s-%s-refs" %(source['organisation'], url, el['timestamp'])).encode('ascii', errors = 'replace')).hexdigest() bit = { @@ -118,4 +122,3 @@ def scan(KibbletBit, source): except: pass # All done! - \ No newline at end of file diff --git a/src/plugins/scanners/jira.py b/src/plugins/scanners/jira.py index 58a8daa..000c05b 100644 --- a/src/plugins/scanners/jira.py +++ b/src/plugins/scanners/jira.py @@ -14,13 +14,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import importlib import time import datetime import re import json import hashlib -import plugins.utils.jsonapi import threading import requests.exceptions @@ -36,7 +35,7 @@ def accepts(source): if source['type'] == 'jira': return True if source['type'] == "issuetracker": - jira = re.match(r"(https?://.+)/browse/([A-Z0-9]+)", url) + jira = re.match(r"(https?://.+)/browse/([A-Z0-9]+)", source) if jira: return True return False @@ -106,12 +105,12 @@ def pchange(js): def scanTicket(KibbleBit, key, u, source, creds, openTickets): """ Scans a single ticket for activity and people """ - + dhash = hashlib.sha224( ("%s-%s-%s" % (source['organisation'], source['sourceURL'], key) ).encode('ascii', errors='replace')).hexdigest() found = True doc= None parseIt = False - + # the 'domain' var we try to figure out here is used # for faking email addresses and keep them unique, # in case JIRA has email visibility turned off. @@ -119,7 +118,7 @@ def scanTicket(KibbleBit, key, u, source, creds, openTickets): m = re.search(r"https?://([^/]+)", u) if m: domain = m.group(1) - + found = KibbleBit.exists('issue', dhash) if not found: KibbleBit.pprint("[%s] We've never seen this ticket before, parsing..." 
% key) @@ -139,13 +138,14 @@ def scanTicket(KibbleBit, key, u, source, creds, openTickets): KibbleBit.pprint("[%s] Ticket contains erroneous data from a previous scan, reparsing" % key) # This is just noise! #KibbleBit.pprint("[%s] Ticket hasn't changed, ignoring..." % key) - + if parseIt: KibbleBit.pprint("[%s] Parsing data from JIRA at %s..." % (key, domain)) queryURL = "%s/rest/api/2/issue/%s?fields=creator,reporter,status,issuetype,summary,assignee,resolutiondate,created,priority,changelog,comment,resolution,votes&expand=changelog" % (u, key) jiraURL = "%s/browse/%s" % (u, key) + jsonapi = importlib.import_module("plugins.utils.jsonapi") try: - tjson = plugins.utils.jsonapi.get(queryURL, auth = creds) + tjson = jsonapi.get(queryURL, auth = creds) if not tjson: KibbleBit.pprint("%s does not exist (404'ed)" % key) return False @@ -157,12 +157,12 @@ def scanTicket(KibbleBit, key, u, source, creds, openTickets): KibbleBit.pprint("Closed but no closer??") closerEmail = None status = 'closed' if st else 'open' - + # Make sure we actually have field data to work with if not tjson.get('fields') or not tjson['fields'].get('created'): KibbleBit.pprint("[%s] JIRA response is missing field data, ignoring ticket." 
% key) return False - + cd = getTime(tjson['fields']['created']) rd = getTime(tjson['fields']['resolutiondate']) if 'resolutiondate' in tjson['fields'] and tjson['fields']['resolutiondate'] else None comments = 0 @@ -190,7 +190,7 @@ def scanTicket(KibbleBit, key, u, source, creds, openTickets): 'upsert': True } KibbleBit.append('person', jsp) - + if creator: creator = creator.replace(" dot ", ".", 10).replace(" at ", "@", 1) if not '@' in creator: @@ -219,7 +219,7 @@ def scanTicket(KibbleBit, key, u, source, creds, openTickets): 'created': cd, 'closed': rd, 'issuetype': 'issue', - 'issueCloser': closerEmail, + 'issueCloser': closerEmail, 'createdDate': time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(cd)), 'closedDate': time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(rd)) if rd else None, 'changeDate': time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(rd if rd else cd)), @@ -234,8 +234,8 @@ def scanTicket(KibbleBit, key, u, source, creds, openTickets): #except Exception as err: #KibbleBit.pprint(err) #return False - - + + class jiraThread(threading.Thread): @@ -247,7 +247,7 @@ def __init__(self, block, KibbleBit, source, creds, pt, ot): self.source = source self.pendingTickets = pt self.openTickets = ot - + def run(self): badOnes = 0 while len(self.pendingTickets) > 0 and badOnes <= 50: @@ -281,13 +281,17 @@ def run(self): def scan(KibbleBit, source): jira = re.match(r"(https?://.+)/browse/([A-Z0-9]+)", source['sourceURL']) if jira: - + + if not 'steps' in source: + source['steps'] = {} + #print("issue source %s" % source ) # JIRA NEEDS credentials to do a proper scan! 
creds = None - if source['creds'] and 'username' in source['creds'] and source['creds']['username'] and len(source['creds']['username']) > 0: + if 'creds' in source and source['creds'] and 'username' in source['creds'] and source['creds']['username'] and len(source['creds']['username']) > 0: creds = "%s:%s" % (source['creds']['username'], source['creds']['password']) if not creds: KibbleBit.pprint("JIRA at %s requires authentication, but none was found! Bailing." % source['sourceURL']) + source['steps']['issues'] = { 'time': time.time(), 'status': 'JIRA endpoint requires auth, but none was provided!', @@ -296,7 +300,7 @@ def scan(KibbleBit, source): } KibbleBit.updateSource(source) return - + source['steps']['issues'] = { 'time': time.time(), 'status': 'Parsing JIRA changes...', @@ -304,7 +308,7 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - + badOnes = 0 jsa = [] jsp = [] @@ -317,16 +321,17 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - + # Get base URL, list and domain to parse u = jira.group(1) instance = jira.group(2) lastTicket = 0 latestURL = "%s/rest/api/2/search?jql=project=%s+order+by+createdDate+DESC&fields=id,key&maxResults=1" % (u, instance) js = None - + + jsonapi = importlib.import_module("plugins.utils.jsonapi") try: - js = plugins.utils.jsonapi.get(latestURL, auth = creds) + js = jsonapi.get(latestURL, auth = creds) except requests.exceptions.ConnectionError as err: KibbleBit.pprint("Connection error, skipping this ticket for now!") source['steps']['issues'] = { @@ -342,8 +347,8 @@ def scan(KibbleBit, source): m = re.search(r"-(\d+)$", key) if m: lastTicket = int(m.group(1)) - - + + openTickets = [] startAt = 0 badTries = 0 @@ -351,7 +356,7 @@ def scan(KibbleBit, source): openURL = "%s/rest/api/2/search?jql=project=%s+and+status=open+order+by+createdDate+ASC&fields=id,key&maxResults=100&startAt=%u" % (u, instance, startAt) #print(openURL) try: - ojs = 
plugins.utils.jsonapi.get(openURL, auth = creds) + ojs = jsonapi.get(openURL, auth = creds) if not 'issues' in ojs or len(ojs['issues']) == 0: break for item in ojs['issues']: @@ -362,12 +367,12 @@ def scan(KibbleBit, source): KibbleBit.pprint("JIRA borked, retrying") badTries += 1 KibbleBit.pprint("Found %u open tickets" % len(openTickets)) - + badOnes = 0 for i in reversed(range(1,lastTicket+1)): key = "%s-%u" % (instance, i) pendingTickets.append([key, u, source]) - + threads = [] block = threading.Lock() KibbleBit.pprint("Scanning tickets using 4 sub-threads") @@ -375,10 +380,10 @@ def scan(KibbleBit, source): t = jiraThread(block, KibbleBit, source, creds, pendingTickets, openTickets) threads.append(t) t.start() - + for t in threads: t.join() - + KibbleBit.pprint("Done scanning %s" % source['sourceURL']) source['steps']['issues'] = { @@ -388,4 +393,3 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - diff --git a/src/plugins/scanners/ponymail.py b/src/plugins/scanners/ponymail.py index abe8fc1..fd9bca2 100644 --- a/src/plugins/scanners/ponymail.py +++ b/src/plugins/scanners/ponymail.py @@ -35,12 +35,12 @@ def accepts(source): # If the source equals the plugin name, assume a yes if source['type'] == 'ponymail': return True - + # If it's of type 'mail', check the URL if source['type'] == 'mail': if re.match(r"(https?://.+)/list\.html\?(.+)@(.+)", source['sourceURL']): return True - + # Default to not recognizing the source return False @@ -86,9 +86,12 @@ def scan(KibbleBit, source): } KibbleBit.updateSource(source) return - + # Pony Mail requires a UI cookie in order to work. Maked sure we have one! 
cookie = None + + if not 'steps' in source: + source['steps'] = {} if 'creds' in source and source['creds']: cookie = source['creds'].get('cookie', None) if not cookie: @@ -101,7 +104,7 @@ def scan(KibbleBit, source): } KibbleBit.updateSource(source) return - + # Notify scanner and DB that this is valid and we've begun parsing KibbleBit.pprint("%s is a valid Pony Mail address, parsing" % source['sourceURL']) source['steps']['mail'] = { @@ -111,13 +114,13 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - - + + # Get base URL, list and domain to parse u = url.group(1) l = url.group(2) d = url.group(3) - + # Get this month dt = time.gmtime(time.time()) firstYear = 1970 @@ -127,15 +130,15 @@ def scan(KibbleBit, source): month += 12 year -= 1 months = 0 - + # Hash for keeping records of who we know knowns = {} - + # While we have older archives, continue to parse while firstYear <= year: statsurl = "%s/api/stats.lua?list=%s&domain=%s&d=%s" % (u, l, d, "%04u-%02u" % (year, month)) dhash = hashlib.sha224((("%s %s") % (source['organisation'], statsurl)).encode('ascii', errors='replace')).hexdigest() - found = False + found = False if KibbleBit.exists('mailstats', dhash): found = True if months <= 1 or not found: # Always parse this month's stats :) @@ -147,7 +150,7 @@ def scan(KibbleBit, source): js = plugins.utils.jsonapi.get(statsurl, cookie = cookie) except Exception as err: KibbleBit.pprint("Server error, skipping this month") - month -= 1 + month -= 1 if month <= 0: month += 12 year -= 1 @@ -196,7 +199,7 @@ def scan(KibbleBit, source): 'id': mlhash } KibbleBit.index('mailtop', mlhash, jst) - + for email in js['emails']: sender = email['from'] name = sender @@ -214,7 +217,7 @@ def scan(KibbleBit, source): if KibbleBit.exists('person',sid): knowns[sender] = True if not sender in knowns or name != sender: - KibbleBit.append('person', + KibbleBit.append('person', { 'upsert': True, 'name': name, @@ -246,8 +249,8 @@ def scan(KibbleBit, source): 
KibbleBit.append('email', jse) for sender in posters: no_posters += 1 - - + + jso = { 'organisation': source['organisation'], 'sourceURL': source['sourceURL'], @@ -259,16 +262,16 @@ def scan(KibbleBit, source): } #print("Indexing as %s" % dhash) KibbleBit.index('mailstats', dhash, jso) - month -= 1 + month -= 1 if month <= 0: month += 12 year -= 1 - - + + source['steps']['mail'] = { 'time': time.time(), 'status': 'Mail archives successfully scanned at ' + time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(time.time())), 'running': False, 'good': True } - KibbleBit.updateSource(source) \ No newline at end of file + KibbleBit.updateSource(source) diff --git a/src/plugins/utils/git.py b/src/plugins/utils/git.py index 8576ce5..3bea67a 100644 --- a/src/plugins/utils/git.py +++ b/src/plugins/utils/git.py @@ -17,8 +17,6 @@ """ This is the Kibble git utility plugin """ -import os -import sys import subprocess import re @@ -30,7 +28,7 @@ def defaultBranch(source, datapath, KibbleBit = None): if KibbleBit and KibbleBit.config.get('git'): wanted_branches = KibbleBit.config['git'].get('wanted_branches', wanted_branches) foundBranch = False - + # For each wanted branch, in order, look for it in our clone, # and return the name if found. for B in wanted_branches: diff --git a/src/plugins/utils/urlmisc.py b/src/plugins/utils/urlmisc.py index 9e75a4b..3340bda 100644 --- a/src/plugins/utils/urlmisc.py +++ b/src/plugins/utils/urlmisc.py @@ -18,6 +18,7 @@ """ This is a Kibble miscellaneous URL functions plugin. 
""" +import base64 import urllib.request import gzip import tempfile @@ -53,9 +54,8 @@ def unzip(url, creds = None, cookie = None): if err.code != 404 and err.code != 401: tmpfile = tempfile.NamedTemporaryFile(mode='w+b', buffering=1, delete=False) subprocess.check_call(("/usr/bin/wget", "-O", tmpfile.name, url)) - + try: - te compressedFile = open("/tmp/kibbletmp.gz", 'rb') if (compressedFile.read(2) == '\x1f\x8b'): compressedFile.seek(0) @@ -72,4 +72,4 @@ def unzip(url, creds = None, cookie = None): tmpfile.flush() tmpfile.close() return tmpfile.name - return None \ No newline at end of file + return None From c043d6784a994d7e3233ca22b5006cbb041efa33 Mon Sep 17 00:00:00 2001 From: Georg Kallidis Date: Mon, 2 Dec 2024 15:28:48 +0100 Subject: [PATCH 8/9] kibble-scanner should be run in modules (src) folder according to python practice, add some (commented) print statements for debugging, increase version from 0.10. to 0.2.0, fix deprecated utcnow to datetime.timezone.utc, add filter feature for jenkins to allow to check a single job in kibble-scanner.py; --- README.md | 2 +- src/kibble-scanner.py | 45 ++++++----- src/plugins/scanners/jenkins.py | 139 ++++++++++++++++++++------------ src/plugins/scanners/travis.py | 77 +++++++++--------- src/plugins/utils/jsonapi.py | 2 +- 5 files changed, 155 insertions(+), 110 deletions(-) diff --git a/README.md b/README.md index 8e85410..21cea0c 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ The Kibble Scanners collect information for the Kibble Suite. ## How to run: - - On a daily/weekly/whatever basis, run: `python3 src/kibble-scanner.py`. + - On a daily/weekly/whatever basis, run in folder src: `python3 kibble-scanner.py`. 
### Command line options: diff --git a/src/kibble-scanner.py b/src/kibble-scanner.py index ac43477..7b441bd 100644 --- a/src/kibble-scanner.py +++ b/src/kibble-scanner.py @@ -28,8 +28,8 @@ import plugins.brokers.kibbleES #import plugins.kibbleJSON -VERSION = "0.1.0" -CONFIG_FILE = "conf/config.yaml" +VERSION = "0.2.0" +CONFIG_FILE = "../conf/config.yaml" PENDING_OBJECTS = [] BIG_LOCK = threading.Lock() @@ -43,15 +43,16 @@ def base_parser(): arg_parser.add_argument("-t", "--type", help="Specific type of scanner to run (default is run all scanners)") arg_parser.add_argument("-e", "--exclude", nargs = '+', help="Specific type of scanner(s) to exclude") arg_parser.add_argument("-v", "--view", help="Specific source view to scan (default is scan all sources)") + arg_parser.add_argument("-j", "--filter", nargs='+', help="Jenkins-only: Filter the list of jobs (e.g. for debugging). To drill down to the target jobs, all nodes to the leaf node(s) are required, e.g --filter . Type is set to jenkins implicitely.") return arg_parser - + def pprint(string, err = False): line = "[core]: %s" % (string) if err: sys.stderr.write(line + "\n") else: print(line) - + def isMine(ID, config): if config['scanner'].get('balance', None): @@ -65,11 +66,11 @@ def isMine(ID, config): return True return False return True - + class scanThread(threading.Thread): """ A thread object that grabs an item from the queue and processes it, using whatever plugins will come out to play. 
""" - def __init__(self, broker, org, i, t = None, e = None): + def __init__(self, broker, org, i, t = None, e = None, f= None): super(scanThread, self).__init__() self.broker = broker self.org = org @@ -77,8 +78,12 @@ def __init__(self, broker, org, i, t = None, e = None): self.bit = self.broker.bitClass(self.broker, self.org, i) self.stype = t self.exclude = e + self.filter = f + # override + if self.filter: + self.stype = "jenkins" pprint("Initialized thread %i" % i) - + def run(self): global BIG_LOCK, PENDING_OBJECTS time.sleep(0.5) # Primarily to align printouts. @@ -89,6 +94,7 @@ def run(self): try: # Try grabbing an object (might not be any left!) obj = PENDING_OBJECTS.pop(0) + #print("object: %s" %(obj)) except: pass BIG_LOCK.release() @@ -97,14 +103,17 @@ def run(self): if isMine(obj['sourceID'], self.broker.config): # Run through list of scanners in order, apply when useful for sid, scanner in plugins.scanners.enumerate(): - + if scanner.accepts(obj): self.bit.pluginname = "plugins/scanners/" + sid # Excluded scanner type? if self.exclude and sid in self.exclude: continue + # specific jenkins filter + if self.stype and self.stype == sid and self.filter and sid == "jenkins": + scanner.scan(self.bit, obj, self.filter) # Specific scanner type or no types mentioned? - if not self.stype or self.stype == sid: + elif not self.stype or self.stype == sid: scanner.scan(self.bit, obj) else: break @@ -115,13 +124,13 @@ def main(): pprint("Kibble Scanner v/%s starting" % VERSION) global CONFIG_FILE, PENDING_OBJECTS args = base_parser().parse_args() - + # Load config yaml if args.config: CONFIG_FILE = args.config config = yaml.load(open(CONFIG_FILE), Loader=yaml.Loader) pprint("Loaded YAML config from %s" % CONFIG_FILE) - + # Which broker type do we use here? 
broker = None if 'elasticsearch' in config and config['elasticsearch'].get('enabled', False): @@ -130,14 +139,14 @@ def main(): else: pprint("Using HTTP JSON broker model") broker = plugins.brokers.kibbleJSON.Broker(config) - + orgNo = 0 sourceNo = 0 for org in broker.organisations(): if not args.org or args.org == org.id: pprint("Processing organisation %s" % org.id) orgNo += 1 - + # Compile source list # If --age is passed, only append source that either # have never been scanned, or have been scanned more than @@ -161,21 +170,21 @@ def main(): if not args.source or (args.source == source['sourceID']) or (args.source == source['sourceURL']): PENDING_OBJECTS.append(source) sourceNo += len(PENDING_OBJECTS) - + # Start up some threads equal to number of cores on the box, # but no more than 4. We don't want an IOWait nightmare. threads = [] core_count = min((4, int( multiprocessing.cpu_count() ))) for i in range(0, core_count): - sThread = scanThread(broker, org, i+1, args.type, args.exclude) + sThread = scanThread(broker, org, i+1, args.type, args.exclude, args.filter) sThread.start() threads.append(sThread) - + # Wait for them all to finish. for t in threads: t.join() - + pprint("All done scanning for now, found %i organisations and %i sources to process." % (orgNo, sourceNo)) - + if __name__ == '__main__': main() diff --git a/src/plugins/scanners/jenkins.py b/src/plugins/scanners/jenkins.py index c09920c..579ba8b 100644 --- a/src/plugins/scanners/jenkins.py +++ b/src/plugins/scanners/jenkins.py @@ -20,18 +20,21 @@ import re import json import hashlib -import plugins.utils.jsonapi + import threading import requests.exceptions import os import urllib.parse +from plugins.utils import jsonapi + + """ This is the Kibble Jenkins scanner plugin. 
""" title = "Scanner for Jenkins CI" -version = "0.1.0" +version = "0.2.0" def accepts(source): """ Determines whether we want to handle this source """ @@ -42,39 +45,43 @@ def accepts(source): def scanJob(KibbleBit, source, job, creds): """ Scans a single job for activity """ - NOW = int(datetime.datetime.utcnow().timestamp()) + NOW = int(datetime.datetime.now(datetime.timezone.utc).timestamp()) jname = job['name'] if job.get('folder'): jname = job.get('folder') + '-' + job['name'] - dhash = hashlib.sha224( ("%s-%s-%s" % (source['organisation'], source['sourceURL'], jname) ).encode('ascii', errors='replace')).hexdigest() + dhash = hashlib.sha224( ("%s-%s-%s" % (source['organisation'], source['sourceURL'], jname) ) + .encode('ascii', errors='replace')).hexdigest() found = True doc= None parseIt = False found = KibbleBit.exists('cijob', dhash) - + # Get $jenkins/job/$job-name/json... jobURL = "%s/api/json?depth=2&tree=builds[number,status,timestamp,id,result,duration]" % job['fullURL'] KibbleBit.pprint(jobURL) - jobjson = plugins.utils.jsonapi.get(jobURL, auth = creds) - + + jobjson = jsonapi.get(jobURL, auth = creds) + # If valid JSON, ... if jobjson: + print("jobjson builds: %s" %( jobjson)) for build in jobjson.get('builds', []): - buildhash = hashlib.sha224( ("%s-%s-%s-%s" % (source['organisation'], source['sourceURL'], jname, build['id']) ).encode('ascii', errors='replace')).hexdigest() + buildhash = hashlib.sha224( ("%s-%s-%s-%s" % (source['organisation'], source['sourceURL'], jname, build['id']) ) + .encode('ascii', errors='replace')).hexdigest() builddoc = None try: builddoc = KibbleBit.get('ci_build', buildhash) except: pass - + # If this build already completed, no need to parse it again if builddoc and builddoc.get('completed', False): continue - + KibbleBit.pprint("[%s-%s] This is new or pending, analyzing..." 
% (jname, build['id'])) - + completed = True if build['result'] else False - + # Estimate time spent in queue queuetime = 0 TS = int(build['timestamp']/1000) @@ -82,7 +89,7 @@ def scanJob(KibbleBit, source, job, creds): queuetime = builddoc.get('queuetime', 0) if not completed: queuetime = NOW - TS - + # Get build status (success, failed, canceled etc) status = 'building' if build['result'] in ['SUCCESS', 'STABLE']: @@ -91,13 +98,13 @@ def scanJob(KibbleBit, source, job, creds): status = 'failed' if build['result'] in ['ABORTED']: status = 'aborted' - + # Calc when the build finished (jenkins doesn't show this) if completed: FIN = int(build['timestamp'] + build['duration']) / 1000 else: FIN = 0 - + doc = { # Build specific data 'id': buildhash, @@ -111,7 +118,7 @@ def scanJob(KibbleBit, source, job, creds): 'started': int(build['timestamp']/1000), 'ci': 'jenkins', 'queuetime': queuetime, - + # Standard docs values 'sourceID': source['sourceID'], 'organisation': source['organisation'], @@ -120,7 +127,7 @@ def scanJob(KibbleBit, source, job, creds): KibbleBit.append('ci_build', doc) # Yay, it worked! return True - + # Boo, it failed! 
KibbleBit.pprint("Fetching job data failed!") return False @@ -135,7 +142,7 @@ def __init__(self, block, KibbleBit, source, creds, jobs): self.creds = creds self.source = source self.jobs = jobs - + def run(self): badOnes = 0 while len(self.jobs) > 0 and badOnes <= 50: @@ -160,7 +167,8 @@ def run(self): self.KibbleBit.pprint("Too many errors, bailing!") self.source['steps']['issues'] = { 'time': time.time(), - 'status': 'Too many errors while parsing at ' + time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(time.time())), + 'status': 'Too many errors while parsing at ' + + time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(time.time())), 'running': False, 'good': False } @@ -169,11 +177,12 @@ def run(self): else: badOnes = 0 -def scan(KibbleBit, source): +def scan(KibbleBit, source, filter=None): # Simple URL check jenkins = re.match(r"(https?://.+)", source['sourceURL']) if jenkins: - + if not 'steps' in source: + source['steps'] = {} source['steps']['jenkins'] = { 'time': time.time(), 'status': 'Parsing Jenkins job changes...', @@ -181,7 +190,7 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - + badOnes = 0 pendingJobs = [] KibbleBit.pprint("Parsing Jenkins activity at %s" % source['sourceURL']) @@ -192,32 +201,41 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - + # Jenkins may neeed credentials creds = None - if source['creds'] and 'username' in source['creds'] and source['creds']['username'] and len(source['creds']['username']) > 0: + if ('creds' in source and source['creds'] and 'username' in source['creds'] and source['creds']['username'] + and len(source['creds']['username']) > 0): creds = "%s:%s" % (source['creds']['username'], source['creds']['password']) - + + if not creds: + KibbleBit.pprint("JENKINS with no %s authentication." 
% source['sourceURL']) + # Get the job list - sURL = source['sourceURL'] - KibbleBit.pprint("Getting job list...") - jobsjs = plugins.utils.jsonapi.get("%s/api/json?tree=jobs[name,color]&depth=1" % sURL , auth = creds) - + sURL: str = source['sourceURL'] + #print("queue URL:", sURL) + KibbleBit.pprint("Getting jenkins job list..." ) + jobsjs = jsonapi.get("%s/api/json?tree=jobs[name,color]&depth=1" % sURL , auth = creds) + #print ("jobsjs:", jobsjs) + # Get the current queue + # This is always at the root of the build instance KibbleBit.pprint("Getting job queue...") - queuejs = plugins.utils.jsonapi.get("%s/queue/api/json?depth=1" % sURL , auth = creds) - + + queuejs = jsonapi.get("%s/queue/api/json?depth=1" % sURL , auth = creds) + # Save queue snapshot - NOW = int(datetime.datetime.utcnow().timestamp()) - queuehash = hashlib.sha224( ("%s-%s-queue-%s" % (source['organisation'], source['sourceURL'], int(time.time())) ).encode('ascii', errors='replace')).hexdigest() - - + NOW = int(datetime.datetime.now(datetime.timezone.utc).timestamp()) + queuehash = hashlib.sha224( ("%s-%s-queue-%s" % (source['organisation'], source['sourceURL'], int(time.time())) ) + .encode('ascii', errors='replace')).hexdigest() + + # Scan queue items blocked = 0 stuck = 0 totalqueuetime = 0 items = queuejs.get('items', []) - + for item in items: if item['blocked']: blocked += 1 @@ -225,11 +243,11 @@ def scan(KibbleBit, source): stuck += 1 if 'inQueueSince' in item: totalqueuetime += (NOW - int(item['inQueueSince']/1000)) - + avgqueuetime = totalqueuetime / max(1, len(items)) - + # Count how many jobs are building, find any folders... 
- actual_jobs, building = get_all_jobs(KibbleBit, source, jobsjs.get('jobs', []), creds) + actual_jobs, building = get_all_jobs(KibbleBit, source, jobsjs.get('jobs', []), filter, creds) # Write up a queue doc queuedoc = { @@ -242,18 +260,18 @@ def scan(KibbleBit, source): 'stuck': stuck, 'avgwait': avgqueuetime, 'ci': 'jenkins', - + # Standard docs values 'sourceID': source['sourceID'], 'organisation': source['organisation'], 'upsert': True, } KibbleBit.append('ci_queue', queuedoc) - - + + pendingJobs = actual_jobs KibbleBit.pprint("Found %u jobs in Jenkins" % len(pendingJobs)) - + threads = [] block = threading.Lock() KibbleBit.pprint("Scanning jobs using 4 sub-threads") @@ -261,32 +279,48 @@ def scan(KibbleBit, source): t = jenkinsThread(block, KibbleBit, source, creds, pendingJobs) threads.append(t) t.start() - + for t in threads: t.join() - # We're all done, yaay + # We're all done, yaay KibbleBit.pprint("Done scanning %s" % source['sourceURL']) + partial = "(filtered) " if filter else '' source['steps']['issues'] = { 'time': time.time(), - 'status': 'Jenkins successfully scanned at ' + time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(time.time())), + 'status': 'Jenkins successfully '+ partial+'scanned at ' + time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(time.time())), 'running': False, 'good': True } KibbleBit.updateSource(source) - -def get_all_jobs(KibbleBit, source, joblist, creds): + +def get_all_jobs(KibbleBit, source, joblist, job_filter, creds): real_jobs = [] building = 0 for job in joblist: + + #print("jobFilter: ", job_filter) + if (job_filter and job['name'] not in job_filter): + print("Skipping job", job['name']) + continue + # Is this a job folder? jclass = job.get('_class') - if jclass in ['jenkins.branch.OrganizationFolder', 'org.jenkinsci.plugins.workflow.multibranch.WorkflowMultiBranchProject']: + + #KibbleBit.pprint("%s has class %s..." 
% (job['name'], jclass)) + + if jclass in ['jenkins.branch.OrganizationFolder', + 'org.jenkinsci.plugins.workflow.multibranch.WorkflowMultiBranchProject', + 'org.jenkinsci.plugins.workflow.job.WorkflowJob', + 'com.cloudbees.hudson.plugins.folder.Folder']: KibbleBit.pprint("%s is a jobs folder, expanding..." % job['name']) + csURL = '%s/job/%s' % (source['sourceURL'], urllib.parse.quote(job['name'].replace('/', '%2F'))) + try: - child_jobs = plugins.utils.jsonapi.get("%s/api/json?tree=jobs[name,color]&depth=1" % csURL, + + child_jobs = jsonapi.get("%s/api/json?tree=jobs[name,color]&depth=1" % csURL, auth=creds) csource = dict(source) csource['sourceURL'] = csURL @@ -294,7 +328,9 @@ def get_all_jobs(KibbleBit, source, joblist, creds): csource['folder'] = job['name'] else: csource['folder'] += '-' + job['name'] - cjobs, cbuilding = get_all_jobs(KibbleBit, csource, child_jobs.get('jobs', []), creds) + cjobs, cbuilding = get_all_jobs(KibbleBit, csource, child_jobs.get('jobs', []), job_filter, creds) + + KibbleBit.pprint("%s (job/folder) entries found." % (len(cjobs)) ) building += cbuilding for cjob in cjobs: real_jobs.append(cjob) @@ -308,5 +344,6 @@ def get_all_jobs(KibbleBit, source, joblist, creds): building += 1 job['fullURL'] = '%s/job/%s' % (source['sourceURL'], urllib.parse.quote(job['name'].replace('/', '%2F'))) job['folder'] = source.get('folder') + #KibbleBit.pprint("Found job %s ..." 
% job) real_jobs.append(job) return real_jobs, building diff --git a/src/plugins/scanners/travis.py b/src/plugins/scanners/travis.py index a42dae9..e6cfadb 100644 --- a/src/plugins/scanners/travis.py +++ b/src/plugins/scanners/travis.py @@ -41,19 +41,19 @@ def accepts(source): def scanJob(KibbleBit, source, bid, token, TLD): """ Scans a single job for activity """ - NOW = int(datetime.datetime.utcnow().timestamp()) + NOW = int(datetime.datetime.now(datetime.timezone.utc).timestamp()) dhash = hashlib.sha224( ("%s-%s-%s" % (source['organisation'], source['sourceURL'], bid) ).encode('ascii', errors='replace')).hexdigest() found = True doc= None parseIt = False found = KibbleBit.exists('cijob', dhash) - + # Get the job data pages = 0 offset = 0 last_page = False oURL = "https://api.travis-ci.%s/repo/%s/builds" % (TLD, bid) - + # For as long as pagination makes sense... while last_page == False: bURL = "https://api.travis-ci.%s/repo/%s/builds?limit=100&offset=%u" % (TLD, bid, offset) @@ -65,13 +65,13 @@ def scanJob(KibbleBit, source, bid, token, TLD): if repojs['@pagination']['is_last']: KibbleBit.pprint("Assuming this is the last page we need (travis says so)") last_page = True - + KibbleBit.pprint("%s has %u builds done" % (bURL, repojs['@pagination']['count'])) - + # BREAKER: If we go past count somehow, and travis doesn't say so, bork anyway if repojs['@pagination']['count'] < offset: return True - + offset += 100 for build in repojs.get('builds', []): buildID = build['id'] @@ -81,15 +81,15 @@ def scanJob(KibbleBit, source, bid, token, TLD): duration = build['duration'] completed = True if duration else False duration = duration or 0 - - + + buildhash = hashlib.sha224( ("%s-%s-%s-%s" % (source['organisation'], source['sourceURL'], bid, buildID) ).encode('ascii', errors='replace')).hexdigest() builddoc = None try: builddoc = KibbleBit.get('ci_build', buildhash) except: pass - + # If this build already completed, no need to parse it again if builddoc and 
builddoc.get('completed', False): # If we're on page > 1 and we've seen a completed build, assume @@ -99,7 +99,7 @@ def scanJob(KibbleBit, source, bid, token, TLD): last_page = True break continue - + # Get build status (success, failed, canceled etc) status = 'building' if build['state'] in ['finished', 'passed']: @@ -108,17 +108,17 @@ def scanJob(KibbleBit, source, bid, token, TLD): status = 'failed' if build['state'] in ['aborted', 'canceled']: status = 'aborted' - + FIN = 0 STA = 0 if finishedAt: FIN = datetime.datetime.strptime(finishedAt, "%Y-%m-%dT%H:%M:%SZ").timestamp() if startedAt: STA = int(datetime.datetime.strptime(startedAt, "%Y-%m-%dT%H:%M:%SZ").timestamp()) - + # We don't know how to calc queues yet, set to 0 queuetime = 0 - + doc = { # Build specific data 'id': buildhash, @@ -132,7 +132,7 @@ def scanJob(KibbleBit, source, bid, token, TLD): 'started': STA, 'ci': 'travis', 'queuetime': queuetime, - + # Standard docs values 'sourceID': source['sourceID'], 'organisation': source['organisation'], @@ -158,7 +158,7 @@ def __init__(self, block, KibbleBit, source, token, jobs, TLD): self.source = source self.jobs = jobs self.tld = TLD - + def run(self): badOnes = 0 while len(self.jobs) > 0 and badOnes <= 50: @@ -201,7 +201,7 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - + badOnes = 0 pendingJobs = [] KibbleBit.pprint("Parsing Travis activity at %s" % source['sourceURL']) @@ -212,7 +212,7 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - + # Travis needs a token token = None if source['creds'] and 'token' in source['creds'] and source['creds']['token'] and len(source['creds']['token']) > 0: @@ -220,14 +220,14 @@ def scan(KibbleBit, source): else: KibbleBit.pprint("Travis CI requires a token to work!") return False - + # Get the job list, paginated sURL = source['sourceURL'] - + # Used for pagination jobs = 100 offset = 0 - + # Counters; builds queued, running and total jobs queued = 0 # We don't 
know how to count this yet building = 0 @@ -235,16 +235,16 @@ def scan(KibbleBit, source): blocked = 0 # Dunno how to count yet stuck = 0 # Ditto avgqueuetime = 0 # Ditto, fake it - + maybeQueued = [] while jobs == 100: URL = "https://api.travis-ci.%s/repos?repository.active=true&sort_by=current_build:desc&offset=%u&limit=100&include=repository.last_started_build" % (TLD, offset) offset += 100 r = requests.get(URL, headers = {'Travis-API-Version': '3', 'Authorization': "token %s" % token}) - + if r.status_code != 200: KibbleBit.pprint("Travis did not return a 200 Okay, bad token?!") - + source['steps']['travis'] = { 'time': time.time(), 'status': 'Travis CI scan failed at ' + time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(time.time()) + ". Bad token??!"), @@ -253,8 +253,8 @@ def scan(KibbleBit, source): } KibbleBit.updateSource(source) return - - + + # For each build job js = r.json() for repo in js['repositories']: @@ -265,15 +265,15 @@ def scan(KibbleBit, source): if cb['state'] in ['started','created', 'queued', 'pending']: for job in cb.get('jobs', []): maybeQueued.append(job['id']) - - + + # Queue up build jobs for the threaded scanner bid = repo['id'] pendingJobs.append(bid) - + jobs = len(js['repositories']) KibbleBit.pprint("Scanned %u jobs..." % total) - + # Find out how many building and pending jobs for jobID in maybeQueued: URL = "https://api.travis-ci.%s/job/%u" % (TLD, jobID) @@ -288,11 +288,11 @@ def scan(KibbleBit, source): blocked += 1 # Queued in Travis generally means a job can't find an executor, and thus is blocked. KibbleBit.pprint("Job %u is pending" % jobID) KibbleBit.pprint("%u building, %u queued..." 
% (building, queued)) - + # Save queue snapshot - NOW = int(datetime.datetime.utcnow().timestamp()) + NOW = int(datetime.datetime.now(datetime.timezone.utc).timestamp()) queuehash = hashlib.sha224( ("%s-%s-queue-%s" % (source['organisation'], source['sourceURL'], int(time.time())) ).encode('ascii', errors='replace')).hexdigest() - + # Write up a queue doc queuedoc = { 'id': queuehash, @@ -304,17 +304,17 @@ def scan(KibbleBit, source): 'stuck': stuck, 'avgwait': avgqueuetime, 'ci': 'travis', - + # Standard docs values 'sourceID': source['sourceID'], 'organisation': source['organisation'], 'upsert': True, } KibbleBit.append('ci_queue', queuedoc) - - + + KibbleBit.pprint("Found %u jobs in Travis" % len(pendingJobs)) - + threads = [] block = threading.Lock() KibbleBit.pprint("Scanning jobs using 4 sub-threads") @@ -322,11 +322,11 @@ def scan(KibbleBit, source): t = travisThread(block, KibbleBit, source, token, pendingJobs, TLD) threads.append(t) t.start() - + for t in threads: t.join() - # We're all done, yaay + # We're all done, yaay KibbleBit.pprint("Done scanning %s" % source['sourceURL']) source['steps']['travis'] = { @@ -336,4 +336,3 @@ def scan(KibbleBit, source): 'good': True } KibbleBit.updateSource(source) - \ No newline at end of file diff --git a/src/plugins/utils/jsonapi.py b/src/plugins/utils/jsonapi.py index 04d173f..9405e2f 100644 --- a/src/plugins/utils/jsonapi.py +++ b/src/plugins/utils/jsonapi.py @@ -40,6 +40,7 @@ def get(url, cookie = None, auth = None, token = None, retries = 5, timeout = 30 headers["Authorization"] = "token %s" % token if cookie: headers["Cookie"] = cookie + # print("fetching url %s" % url) rv = requests.get(url, headers = headers, timeout = (CONNECT_TIMEOUT, timeout)) # Some services may be rate limited. We'll try sleeping it off in 60 second # intervals for a max of five minutes, then give up. 
@@ -85,4 +86,3 @@ def post(url, data, cookie = None, auth = None): rv = requests.post(url, headers = headers, json = data) js = rv.json() return js - From bc5df1b77faa2699e8343490dfd91c94f3d812e9 Mon Sep 17 00:00:00 2001 From: Georg Kallidis Date: Wed, 11 Dec 2024 16:19:38 +0100 Subject: [PATCH 9/9] Provide a simple build environment using setuptools in pyproject.toml; add info about packaging in README.md; Add conf/config.yaml.sample and ignore config.yaml in .gitignore. Fix pre_commit package name to underscore in requirements. --- .gitignore | 34 ++++++++++++++ README.md | 22 +++++++-- conf/{config.yaml => config.yaml.sample} | 0 pyproject.toml | 60 ++++++++++++++++++++++++ requirements.txt | 2 +- 5 files changed, 112 insertions(+), 6 deletions(-) create mode 100644 .gitignore rename conf/{config.yaml => config.yaml.sample} (100%) create mode 100644 pyproject.toml diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..83d5114 --- /dev/null +++ b/.gitignore @@ -0,0 +1,34 @@ +# Configuration file: +/conf/config.yaml + +# Distribution / packaging +build/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +*.egg-info/ +/dist/ +MANIFEST + +# Python cache / compiled files: +__pycache__/ +*.py[cod] + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Environments +/.venv*/ +/venv*/ +/.env*/ +/env*/ + + +# JetBrains IDE +/.idea/ diff --git a/README.md b/README.md index 21cea0c..9883b1e 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,10 @@ The Kibble Scanners collect information for the Kibble Suite. - Edit conf/config.yaml to match your Kibble service +### Dependencies + + - a running Elasticsearch server + ## How to run: - On a daily/weekly/whatever basis, run in folder src: `python3 kibble-scanner.py`. @@ -75,7 +79,9 @@ The Kibble Scanners collect information for the Kibble Suite. 
- python3-certifi - python3-yaml - ## Build environment +### Testing + + TBD ### Pre-commit @@ -98,13 +104,19 @@ The Kibble Scanners collect information for the Kibble Suite. If installed the pre-commit reads the configuration, and will check on the hooks, currently pre-commit and pre-push. While the checks are not satisfied, just rerun the commit command until the hook checks are passed. -### Testing - - TBD ### Project build - TBD + After installation of the build tool + + pip install -q build + + build the project by running + + python -m build + + Find more information in pyproject.toml file and [Setuptools](https://setuptools.pypa.io/). # Get involved + TBD. Please see https://kibble.apache.org/ for details! diff --git a/conf/config.yaml b/conf/config.yaml.sample similarity index 100% rename from conf/config.yaml rename to conf/config.yaml.sample diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..708dca3 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,60 @@ +############################## +# Python packaging settings: # + +[build-system] +requires = ["setuptools", "setuptools-scm"] +build-backend = "setuptools.build_meta" +
+[project] +name = "kibble-scanners" +description = "The Kibble Scanners collect information for the Kibble Suite. Apache Kibble is a tool to collect, aggregate and visualize data about any software project that uses commonly known tools." 
+version = "1.0.0" + +dependencies = [ + "python-dateutil", + "certifi", + "requests", + "psutil", + "elasticsearch", + "PyYAML>=5.2", +] + +requires-python = ">=3.9, <4.0" + +authors = [ + { name = "Apache Software Foundation", email = "dev@kibble.apache.org" }, +] +maintainers = [ + { name = "Apache Software Foundation", email="dev@kibble.apache.org" }, +] +keywords = [ + "kibble-scanners", "data" ] + +license = { text = "Apache License, Version 2.0" } +classifiers = [ + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Environment :: Console", + "Framework :: Apache Kibble-Scanners", + "License :: OSI Approved :: Apache Software License", + "Intended Audience :: Developers", + "Intended Audience :: System Administrators", + "Topic :: System :: Monitoring" +] + +dynamic = [ + "readme" +] + +[project.urls] +repository = "https://github.com/apache/kibble-scanners.git" +"Bug Tracker" = "https://github.com/apache/kibble-scanners/issues" + + +[tool.setuptools] +dynamic = { readme = { file = ["README.md"] } } +packages.find = { where = ["src"] } diff --git a/requirements.txt b/requirements.txt index 668c04d..6755b2d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,4 @@ psutil python-dateutil requests pyyaml -pre-commit +pre_commit