# Patch summary: git unified diff touching three modules under cdx_writer/.
#
#   command.py    - imports VideoDispatcher from .video and adds a trailing
#                   comma after the 'screenshot' dispatcher entry.
#                   NOTE(review): no hunk visible here adds an entry that uses
#                   VideoDispatcher, and .video is not part of the visible
#                   diff; confirm both are handled elsewhere.
#   dispatcher.py - adds VideoMetaHandler to the .handler import and adds a
#                   dispatch_metadata(self, record, env) method, placed after
#                   dispatch_resource, that returns VideoMetaHandler when
#                   record.content_type is truthy and starts with
#                   'application/json;generator-youtube-dl', otherwise None.
#   handler.py    - appends class VideoMetaHandler(RecordHandler) defining
#                   three properties: original_url (an
#                   'http://wayback-metadata.archive.org/video-meta/' prefix
#                   plus safe_url()), massaged_url (an
#                   'org,archive,wayback-metadata)/video-meta/' prefix plus
#                   urlkey(safe_url())), and mime_type (the result of
#                   _normalize_content_type on the youtube-dl JSON type).
#
# NOTE(review): the diff's original line breaks and indentation appear to
# have been flattened to single spaces, and the final physical line break
# falls between `return` and its expression in massaged_url. Restore the
# original line structure before trying to apply this patch.
diff --git a/cdx_writer/command.py b/cdx_writer/command.py index 6305006..484af74 100644 --- a/cdx_writer/command.py +++ b/cdx_writer/command.py @@ -12,6 +12,7 @@ from .dispatcher import DefaultDispatcher, AllDispatcher from .screenshot import ScreenshotDispatcher +from .video import VideoDispatcher from .exclusion import PrefixExclusion from .handler import RecordHandler from .archive import ArchiveRecordReader @@ -21,7 +22,7 @@ class CDX_Writer(object): 'default': DefaultDispatcher(), 'all': AllDispatcher(), - 'screenshot': ScreenshotDispatcher() + 'screenshot': ScreenshotDispatcher(), } def __init__(self, in_file, out_file=sys.stdout, format="N b a m s k r M S V g", diff --git a/cdx_writer/dispatcher.py b/cdx_writer/dispatcher.py index e6ec7ae..ea3591d 100644 --- a/cdx_writer/dispatcher.py +++ b/cdx_writer/dispatcher.py @@ -1,5 +1,5 @@ from .handler import (RecordHandler, ResponseHandler, RevisitHandler, - ResourceHandler, FtpHandler, WarcinfoHandler) + ResourceHandler, FtpHandler, WarcinfoHandler, VideoMetaHandler) __all__ = [ 'RecordDispatcher', 'DefaultDispatcher', 'AllDispatcher' @@ -71,6 +71,13 @@ def dispatch_resource(self, record, env): elif record.url.startswith(('http://', 'https://')): return ResourceHandler return None + + def dispatch_metadata(self, record, env): + content_type = record.content_type + + if content_type and content_type.startswith('application/json;generator-youtube-dl'): + return VideoMetaHandler + return None class AllDispatcher(DefaultDispatcher): diff --git a/cdx_writer/handler.py b/cdx_writer/handler.py index 3601e9e..1e29d4f 100644 --- a/cdx_writer/handler.py +++ b/cdx_writer/handler.py @@ -864,3 +864,16 @@ def new_style_checksum(self): return digest.replace('sha1:', '') return self.content.content_digest() + +class VideoMetaHandler(RecordHandler): + @property + def original_url(self): + return 'http://wayback-metadata.archive.org/video-meta/' + self.safe_url() + + @property + def massaged_url(self): + return 
'org,archive,wayback-metadata)/video-meta/' + self.urlkey(self.safe_url()) + + @property + def mime_type(self): + return self._normalize_content_type('application/json;generator-youtube-dl')