Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 34 additions & 32 deletions metawarc/cmds/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

def bufcount(filename):
    """Count the number of newline characters in *filename*.

    Reads the file in 1 MiB chunks so arbitrarily large files are
    handled with constant memory.

    :param filename: path of the text file to scan
    :returns: number of ``'\\n'`` characters found (lines without a
        trailing newline are not counted — same as the original)
    :raises OSError: if the file cannot be opened
    """
    lines = 0
    buf_size = 1024 * 1024
    # 'with' guarantees the file is closed even if read() raises,
    # fixing the leak in the bare open()/close() version.
    with open(filename) as f:
        read_f = f.read  # hoist the bound method out of the loop
        buf = read_f(buf_size)
        while buf:
            lines += buf.count('\n')
            buf = read_f(buf_size)
    return lines

def cdx_size_counter(filename):
Expand Down Expand Up @@ -57,33 +57,35 @@ def index_content(self, fromfile):
print('CDX file found. Estimated number of WARC records %d' % (records_num))
else:
print("No CDX file. Cant measure progress")
n = 0
for record in iterator:
if record.rec_type != "response":
continue
n += 1
if records_num is not None:
if n % THRESHOLD == 0: print('Processed %d (%0.2f%%) records' % (n, n*100.0 / records_num))
else:
if n % THRESHOLD == 0: print('Processed %d records' % (n))
if record.http_headers is not None:
dbrec = models.Record()
dbrec.warc_id = record.rec_headers.get_header("WARC-Record-ID").rsplit(':', 1)[-1].strip('>')
content_type = record.http_headers.get_header("content-type")
dbrec.content_type = content_type
dbrec.offset = iterator.get_record_offset()
dbrec.length = iterator.get_record_length()
dbrec.url = record.rec_headers.get_header("WARC-Target-URI")
warc_date = record.rec_headers.get_header("WARC-Date")
dbrec.rec_date = datetime.strptime(warc_date, "%Y-%m-%dT%H:%M:%S%z")
dbrec.content_length = int(record.rec_headers.get_header("Content-Length"))
dbrec.status_code = int(record.http_headers.get_statuscode())
dbrec.headers = json.dumps(dict(record.http_headers.headers))
dbrec.source = fromfile
dbrec.filename = dbrec.url.rsplit("?", 1)[0].rsplit("/", 1)[-1].lower()
dbrec.ext = dbrec.filename.rsplit(".", 1)[-1] if dbrec.filename.find(".") > -1 else ""
session.add(dbrec)
session.commit()
n = 0
try:
for record in iterator:
if record.rec_type != "response":
continue
n += 1
if records_num is not None:
if n % THRESHOLD == 0: print('Processed %d (%0.2f%%) records' % (n, n*100.0 / records_num))
else:
if n % THRESHOLD == 0: print('Processed %d records' % (n))
if record.http_headers is not None:
dbrec = models.Record()
dbrec.warc_id = record.rec_headers.get_header("WARC-Record-ID").rsplit(':', 1)[-1].strip('>')
content_type = record.http_headers.get_header("content-type")
dbrec.content_type = content_type
dbrec.offset = iterator.get_record_offset()
dbrec.length = iterator.get_record_length()
dbrec.url = record.rec_headers.get_header("WARC-Target-URI")
warc_date = record.rec_headers.get_header("WARC-Date")
dbrec.rec_date = datetime.strptime(warc_date, "%Y-%m-%dT%H:%M:%S%z")
dbrec.content_length = int(record.rec_headers.get_header("Content-Length"))
dbrec.status_code = int(record.http_headers.get_statuscode())
dbrec.headers = json.dumps(dict(record.http_headers.headers))
dbrec.source = fromfile
dbrec.filename = dbrec.url.rsplit("?", 1)[0].rsplit("/", 1)[-1].lower()
dbrec.ext = dbrec.filename.rsplit(".", 1)[-1] if dbrec.filename.find(".") > -1 else ""
session.add(dbrec)
finally:
session.commit()
resp.close()

def calc_stats(self, mode='mime'):
Expand All @@ -95,14 +97,14 @@ def calc_stats(self, mode='mime'):
engine = create_engine("sqlite:///metawarc.db", echo=False)
session = Session(engine)
if mode == 'mimes':
results = session.query(models.Record.content_type, func.sum(models.Record.content_length), func.count(models.Record.warc_id)).group_by(models.Record.content_type).all()
results = session.query(models.Record.content_type, func.sum(models.Record.content_length), func.count(models.Record.warc_id)).group_by(models.Record.content_type).all()
title = 'Group by mime type'
headers = ('mime', 'size', 'size share', 'count')
elif mode == 'exts':
results = session.query(models.Record.ext, func.sum(models.Record.content_length), func.count(models.Record.warc_id)).group_by(models.Record.ext).all()
results = session.query(models.Record.ext, func.sum(models.Record.content_length), func.count(models.Record.warc_id)).group_by(models.Record.ext).all()
title = 'Group by file extension'
headers = ('extension', 'size', 'size share', 'count')

reptable = Table(title=title)
reptable.add_column(headers[0], justify="left", style="magenta")
for key in headers[1:-1]:
Expand Down