From c0d83b7765d6f0c5dd9a4968d2e4c6b0852cd0b5 Mon Sep 17 00:00:00 2001 From: Bill OConnor Date: Tue, 26 Oct 2021 19:47:16 +0000 Subject: [PATCH 1/6] Add indexing of video metadata records. --- cdx_writer/command.py | 6 +++++- cdx_writer/video.py | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 cdx_writer/video.py diff --git a/cdx_writer/command.py b/cdx_writer/command.py index 87895e8..56c2395 100644 --- a/cdx_writer/command.py +++ b/cdx_writer/command.py @@ -12,6 +12,7 @@ from .dispatcher import DefaultDispatcher, AllDispatcher from .screenshot import ScreenshotDispatcher +from .video import VideoDispatcher from .exclusion import PrefixExclusion from .handler import RecordHandler from .archive import ArchiveRecordReader @@ -21,7 +22,8 @@ class CDX_Writer(object): 'default': DefaultDispatcher(), 'all': AllDispatcher(), - 'screenshot': ScreenshotDispatcher() + 'screenshot': ScreenshotDispatcher(), + 'video' : VideoDispatcher(), } def __init__(self, in_file, out_file=sys.stdout, format="N b a m s k r M S V g", @@ -282,6 +284,8 @@ def main(args=None): help="By default we only index http responses. 
Use this flag to index all WARC records in the file") parser.add_option("--screenshot-mode", dest="dispatch_mode", action="store_const", const="screenshot", help="Special Wayback Machine mode for handling WARCs containing screenshots") + parser.add_option("--video-mode", dest="dispatch_mode", action="store_const", const="video", + help="Special Wayback Machine mode for handling WARCs containing video") parser.add_option("--exclude-list", dest="exclude_list", help="File containing url prefixes to exclude") parser.add_option("--stats-file", dest="stats_file", help="Output json file containing statistics") parser.add_option("--no-host-massage", dest="canonicalizer_options", diff --git a/cdx_writer/video.py b/cdx_writer/video.py new file mode 100644 index 0000000..178db8d --- /dev/null +++ b/cdx_writer/video.py @@ -0,0 +1,36 @@ +from .handler import RecordHandler +from .dispatcher import DefaultDispatcher + +class VideoMetaHandler(RecordHandler): + @property + def original_url(self): + return 'http://wayback-metadata.archive.org/video-meta/' + self.safe_url() + + @property + def massaged_url(self): + return 'org,archive,wayback-metadata)/video-meta/' + self.urlkey(self.safe_url()) + + @property + def mime_type(self): + return self._normalize_content_type('application/json;generator-youtube-dl') + +class ArchivedVideoHandler(RecordHandler): + @property + def original_url(self): + return 'http://wayback-metadata.archive.org/video-meta/' + self.safe_url() + + @property + def massaged_url(self): + return 'org,archive,wayback-metadata)/archived-video/' + self.urlkey(self.safe_url()) + + @property + def mime_type(self): + return self._normalize_content_type('application/json;generator=yt-archiver') + +class VideoDispatcher(DefaultDispatcher): + def dispatch_metadata(self, record, env): + content_type = record.content_type + + if content_type and content_type.startswith('application/json;generator=yt-archiver'): + return ArchivedVideoHandler + return None From 
e50d612efce339997c407a5fe667714a1b90a32e Mon Sep 17 00:00:00 2001 From: Bill OConnor Date: Tue, 26 Oct 2021 12:58:35 -0700 Subject: [PATCH 2/6] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index c19af75..7e01c59 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ Options: to index all WARC records in the file --screenshot-mode Special Wayback Machine mode for handling WARCs containing screenshots + --video-mode Special Wayback Machine mode for handling WARCs containing video --exclude-list=EXCLUDE_LIST File containing url prefixes to exclude --stats-file=STATS_FILE Output json file containing statistics From d8ad87ae6ea1dbc706e618a615d3d92ade7a59ce Mon Sep 17 00:00:00 2001 From: Bill OConnor Date: Thu, 28 Oct 2021 17:12:46 +0000 Subject: [PATCH 3/6] Index the video metafile. --- cdx_writer/video.py | 16 ++-------------- setup.py | 2 +- 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/cdx_writer/video.py b/cdx_writer/video.py index 178db8d..420955b 100644 --- a/cdx_writer/video.py +++ b/cdx_writer/video.py @@ -14,23 +14,11 @@ def massaged_url(self): def mime_type(self): return self._normalize_content_type('application/json;generator-youtube-dl') -class ArchivedVideoHandler(RecordHandler): - @property - def original_url(self): - return 'http://wayback-metadata.archive.org/video-meta/' + self.safe_url() - - @property - def massaged_url(self): - return 'org,archive,wayback-metadata)/archived-video/' + self.urlkey(self.safe_url()) - - @property - def mime_type(self): - return self._normalize_content_type('application/json;generator=yt-archiver') class VideoDispatcher(DefaultDispatcher): def dispatch_metadata(self, record, env): content_type = record.content_type - if content_type and content_type.startswith('application/json;generator=yt-archiver'): - return ArchivedVideoHandler + if content_type and content_type.startswith('application/json;generator-youtube-dl'): + return VideoMetaHandler 
return None diff --git a/setup.py b/setup.py index b7f34da..126d16b 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='CDX-Writer', - version='0.4.4.4', + version='0.4.4.5', packages=find_packages(), install_requires=[ 'warctools>=4.10.0', From 678a9e94cb02f82dc4ec60d356fa4a549d1d13d2 Mon Sep 17 00:00:00 2001 From: Bill OConnor Date: Wed, 5 Jan 2022 23:44:32 +0000 Subject: [PATCH 4/6] Remove video metadata as option and make run all the time. --- cdx_writer/command.py | 3 --- cdx_writer/dispatcher.py | 16 +++++++++++++++- cdx_writer/handler.py | 13 +++++++++++++ cdx_writer/video.py | 24 ------------------------ 4 files changed, 28 insertions(+), 28 deletions(-) delete mode 100644 cdx_writer/video.py diff --git a/cdx_writer/command.py b/cdx_writer/command.py index 56c2395..de29253 100644 --- a/cdx_writer/command.py +++ b/cdx_writer/command.py @@ -23,7 +23,6 @@ class CDX_Writer(object): 'all': AllDispatcher(), 'screenshot': ScreenshotDispatcher(), - 'video' : VideoDispatcher(), } def __init__(self, in_file, out_file=sys.stdout, format="N b a m s k r M S V g", @@ -284,8 +283,6 @@ def main(args=None): help="By default we only index http responses. 
Use this flag to index all WARC records in the file") parser.add_option("--screenshot-mode", dest="dispatch_mode", action="store_const", const="screenshot", help="Special Wayback Machine mode for handling WARCs containing screenshots") - parser.add_option("--video-mode", dest="dispatch_mode", action="store_const", const="video", - help="Special Wayback Machine mode for handling WARCs containing video") parser.add_option("--exclude-list", dest="exclude_list", help="File containing url prefixes to exclude") parser.add_option("--stats-file", dest="stats_file", help="Output json file containing statistics") parser.add_option("--no-host-massage", dest="canonicalizer_options", diff --git a/cdx_writer/dispatcher.py b/cdx_writer/dispatcher.py index e6ec7ae..f6649ec 100644 --- a/cdx_writer/dispatcher.py +++ b/cdx_writer/dispatcher.py @@ -1,5 +1,5 @@ from .handler import (RecordHandler, ResponseHandler, RevisitHandler, - ResourceHandler, FtpHandler, WarcinfoHandler) + ResourceHandler, FtpHandler, WarcinfoHandler, VideoMetaHandler) __all__ = [ 'RecordDispatcher', 'DefaultDispatcher', 'AllDispatcher' @@ -71,6 +71,13 @@ def dispatch_resource(self, record, env): elif record.url.startswith(('http://', 'https://')): return ResourceHandler return None + + def dispatch_metadata(self, record, env): + content_type = record.content_type + + if content_type and content_type.startswith('application/json;generator-youtube-dl'): + return VideoMetaHandler + return None class AllDispatcher(DefaultDispatcher): @@ -87,5 +94,12 @@ def dispatch_resource(self, record, env): def dispatch_warcinfo(self, record, env): return WarcinfoHandler + def dispatch_metadata(self, record, env): + content_type = record.content_type + + if content_type and content_type.startswith('application/json;generator-youtube-dl'): + return VideoMetaHandler + return None + def dispatch_any(self, record, env): return RecordHandler diff --git a/cdx_writer/handler.py b/cdx_writer/handler.py index 3601e9e..1e29d4f 100644 --- 
a/cdx_writer/handler.py +++ b/cdx_writer/handler.py @@ -864,3 +864,16 @@ def new_style_checksum(self): return digest.replace('sha1:', '') return self.content.content_digest() + +class VideoMetaHandler(RecordHandler): + @property + def original_url(self): + return 'http://wayback-metadata.archive.org/video-meta/' + self.safe_url() + + @property + def massaged_url(self): + return 'org,archive,wayback-metadata)/video-meta/' + self.urlkey(self.safe_url()) + + @property + def mime_type(self): + return self._normalize_content_type('application/json;generator-youtube-dl') diff --git a/cdx_writer/video.py b/cdx_writer/video.py deleted file mode 100644 index 420955b..0000000 --- a/cdx_writer/video.py +++ /dev/null @@ -1,24 +0,0 @@ -from .handler import RecordHandler -from .dispatcher import DefaultDispatcher - -class VideoMetaHandler(RecordHandler): - @property - def original_url(self): - return 'http://wayback-metadata.archive.org/video-meta/' + self.safe_url() - - @property - def massaged_url(self): - return 'org,archive,wayback-metadata)/video-meta/' + self.urlkey(self.safe_url()) - - @property - def mime_type(self): - return self._normalize_content_type('application/json;generator-youtube-dl') - - -class VideoDispatcher(DefaultDispatcher): - def dispatch_metadata(self, record, env): - content_type = record.content_type - - if content_type and content_type.startswith('application/json;generator-youtube-dl'): - return VideoMetaHandler - return None From c88c545db1d058f08c9cac07c371dd8a3707a327 Mon Sep 17 00:00:00 2001 From: Bill OConnor Date: Thu, 13 Jan 2022 15:00:11 -0800 Subject: [PATCH 5/6] Remove redundant dispatch_metadata --- cdx_writer/dispatcher.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/cdx_writer/dispatcher.py b/cdx_writer/dispatcher.py index f6649ec..ea3591d 100644 --- a/cdx_writer/dispatcher.py +++ b/cdx_writer/dispatcher.py @@ -94,12 +94,5 @@ def dispatch_resource(self, record, env): def dispatch_warcinfo(self, record, env): return 
WarcinfoHandler - def dispatch_metadata(self, record, env): - content_type = record.content_type - - if content_type and content_type.startswith('application/json;generator-youtube-dl'): - return VideoMetaHandler - return None - def dispatch_any(self, record, env): return RecordHandler From 007f18af2c85e5a5e82d26c57112e83e0553194f Mon Sep 17 00:00:00 2001 From: Bill OConnor Date: Thu, 13 Jan 2022 15:03:14 -0800 Subject: [PATCH 6/6] Remove option from doc - had already been removed from code. --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 7e01c59..c19af75 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,6 @@ Options: to index all WARC records in the file --screenshot-mode Special Wayback Machine mode for handling WARCs containing screenshots - --video-mode Special Wayback Machine mode for handling WARCs containing video --exclude-list=EXCLUDE_LIST File containing url prefixes to exclude --stats-file=STATS_FILE Output json file containing statistics