diff --git a/metawarc/cmds/indexer.py b/metawarc/cmds/indexer.py index ab2bcaf..9c61272 100644 --- a/metawarc/cmds/indexer.py +++ b/metawarc/cmds/indexer.py @@ -17,7 +17,7 @@ def bufcount(filename): """Count number of lines""" - f = open(filename) + f = open(filename) lines = 0 buf_size = 1024 * 1024 read_f = f.read # loop optimization @@ -26,7 +26,7 @@ def bufcount(filename): while buf: lines += buf.count('\n') buf = read_f(buf_size) - f.close() + f.close() return lines def cdx_size_counter(filename): @@ -57,33 +57,35 @@ def index_content(self, fromfile): print('CDX file found. Estimated number of WARC records %d' % (records_num)) else: print("No CDX file. Cant measure progress") - n = 0 - for record in iterator: - if record.rec_type != "response": - continue - n += 1 - if records_num is not None: - if n % THRESHOLD == 0: print('Processed %d (%0.2f%%) records' % (n, n*100.0 / records_num)) - else: - if n % THRESHOLD == 0: print('Processed %d records' % (n)) - if record.http_headers is not None: - dbrec = models.Record() - dbrec.warc_id = record.rec_headers.get_header("WARC-Record-ID").rsplit(':', 1)[-1].strip('>') - content_type = record.http_headers.get_header("content-type") - dbrec.content_type = content_type - dbrec.offset = iterator.get_record_offset() - dbrec.length = iterator.get_record_length() - dbrec.url = record.rec_headers.get_header("WARC-Target-URI") - warc_date = record.rec_headers.get_header("WARC-Date") - dbrec.rec_date = datetime.strptime(warc_date, "%Y-%m-%dT%H:%M:%S%z") - dbrec.content_length = int(record.rec_headers.get_header("Content-Length")) - dbrec.status_code = int(record.http_headers.get_statuscode()) - dbrec.headers = json.dumps(dict(record.http_headers.headers)) - dbrec.source = fromfile - dbrec.filename = dbrec.url.rsplit("?", 1)[0].rsplit("/", 1)[-1].lower() - dbrec.ext = dbrec.filename.rsplit(".", 1)[-1] if dbrec.filename.find(".") > -1 else "" - session.add(dbrec) - session.commit() + n = 0 + try: + for record in iterator: + if record.rec_type != "response": + continue + n += 1 + if records_num is not None: + if n % THRESHOLD == 0: print('Processed %d (%0.2f%%) records' % (n, n*100.0 / records_num)) + else: + if n % THRESHOLD == 0: print('Processed %d records' % (n)) + if record.http_headers is not None: + dbrec = models.Record() + dbrec.warc_id = record.rec_headers.get_header("WARC-Record-ID").rsplit(':', 1)[-1].strip('>') + content_type = record.http_headers.get_header("content-type") + dbrec.content_type = content_type + dbrec.offset = iterator.get_record_offset() + dbrec.length = iterator.get_record_length() + dbrec.url = record.rec_headers.get_header("WARC-Target-URI") + warc_date = record.rec_headers.get_header("WARC-Date") + dbrec.rec_date = datetime.strptime(warc_date, "%Y-%m-%dT%H:%M:%S%z") + dbrec.content_length = int(record.rec_headers.get_header("Content-Length")) + dbrec.status_code = int(record.http_headers.get_statuscode()) + dbrec.headers = json.dumps(dict(record.http_headers.headers)) + dbrec.source = fromfile + dbrec.filename = dbrec.url.rsplit("?", 1)[0].rsplit("/", 1)[-1].lower() + dbrec.ext = dbrec.filename.rsplit(".", 1)[-1] if dbrec.filename.find(".") > -1 else "" + session.add(dbrec) + finally: + session.commit() resp.close() def calc_stats(self, mode='mime'): @@ -95,14 +97,14 @@ def calc_stats(self, mode='mime'): engine = create_engine("sqlite:///metawarc.db", echo=False) session = Session(engine) if mode == 'mimes': - results = session.query(models.Record.content_type, func.sum(models.Record.content_length), func.count(models.Record.warc_id)).group_by(models.Record.content_type).all() + results = session.query(models.Record.content_type, func.sum(models.Record.content_length), func.count(models.Record.warc_id)).group_by(models.Record.content_type).all() title = 'Group by mime type' headers = ('mime', 'size', 'size share', 'count') elif mode == 'exts': - results = session.query(models.Record.ext, func.sum(models.Record.content_length), func.count(models.Record.warc_id)).group_by(models.Record.ext).all() + results = session.query(models.Record.ext, func.sum(models.Record.content_length), func.count(models.Record.warc_id)).group_by(models.Record.ext).all() title = 'Group by file extension' headers = ('extension', 'size', 'size share', 'count') - + reptable = Table(title=title) reptable.add_column(headers[0], justify="left", style="magenta") for key in headers[1:-1]: