PodcastScraper/scraper.py at master · jwhong/PodcastScraper · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

# USAGE: This script crawls a podbay.fm show page, follows all the episode links,
# then follows all the mp3 download links on the episode pages.
# The show URL is hardcoded into the script; edit it at the bottom of this file.

__author__ = 'JWhong'

import sys

# Refuse to run under Python 2: the imports that follow (urllib.request) only
# exist on Python 3.
if sys.version_info < (3, 0):
    print("Python 3 required!")
    # This branch only ever executes on Python 2, where raw_input() is the
    # plain-text prompt. Presumably it is here so a double-clicked console
    # window stays open long enough to read the message.
    raw_input("Press Enter to finish...")  # noqa: F821
    # sys.exit() instead of the site-provided exit(), which is meant for the
    # interactive shell and may be missing; a non-zero status reports failure.
    sys.exit(1)

import urllib.request
import re
import time
import os
import traceback

class Reporter(object):
    """
    Keeps track of bytes downloaded on a particular file and prints progress.
    Meant to be instantiated, then have reportHook fed in to
    urllib.request.urlretrieve as its reporthook argument.
    """
    # A progress line is written once every this many blocks.
    REPORT_EVERY_N_BLOCKS = 64

    def __init__(self):
        self.bytes_downloaded = 0
        self.tstart = time.time()
        print("")

    def reportHook(self, count, block_size, total_size):
        """
        Progress callback with the signature urlretrieve expects.
        :param count: number of blocks transferred so far (0 on the initial call)
        :param block_size: size of one block in bytes
        :param total_size: total file size in bytes; zero or negative when the
                           server did not report one
        :return: None
        """
        if count == 0:
            # A count of 0 marks the start of a transfer. Restart the clock so
            # the rate stays meaningful if this hook is reused for a retry.
            self.tstart = time.time()

        # Derive the byte count from the block count instead of accumulating
        # block_size per call: the initial call (count == 0) has transferred
        # nothing yet, and the last block is usually shorter than block_size.
        downloaded = count * block_size
        size_known = total_size > 0
        if size_known:
            downloaded = min(downloaded, total_size)
        self.bytes_downloaded = downloaded

        finished = size_known and downloaded >= total_size
        if count % self.REPORT_EVERY_N_BLOCKS and not finished:
            return  # Throttle output, but always show the final state

        elapsed = time.time() - self.tstart
        mb_done = downloaded / (1024 * 1024)
        # Guard against a zero interval (coarse clocks on the first call).
        rate = mb_done / elapsed if elapsed > 0 else 0.0
        if size_known:
            mb_total = total_size / (1024 * 1024)
            sys.stdout.write("\r%04.1f%% done :: %04.2fMB/s :: %05.2fMB/%05.2fMB"%(100*downloaded/total_size,rate,mb_done,mb_total))
        else:
            # Without a total size there is no percentage to show.
            sys.stdout.write("\r%05.2fMB done :: %04.2fMB/s"%(mb_done,rate))
        sys.stdout.flush()

class PodbayShowScraper(object):
    """
    Crawls a podbay.fm show page, follows every episode link found on it and
    downloads each mp3 referenced by the episode pages into the current
    working directory.
    """
    def __init__(self):
        # Raw strings so the regex escapes are not parsed as (invalid) string
        # escapes; the dot in "podbay.fm" is escaped so it only matches a dot.
        # Both patterns capture the text between a pair of double quotes.
        self.mp3_matcher     = re.compile(r'(?<=")[^"]+\.mp3(?=")')
        self.episode_matcher = re.compile(r'(?<=")[^"]+podbay\.fm[^"]+autostart=1(?=")')

    def tryNTimes(self, f, n):
        """
        Generic exception catcher
        :param f: Callable to try
        :param n: number of times to try
        :return: whatever the callable returns, None if every attempt raised
                 (or if n is not positive)
        """
        for i in range(n):
            try:
                return f()
            except Exception:
                # Deliberately broad: any failure is reported and retried.
                print("Lambda error!  Attempt %d"%i)
                traceback.print_exc()
        return None

    def _fetchPage(self, url):
        """
        Fetches a page and decodes it to text, retrying up to 5 times.
        :param url: A string of the page address
        :return: The page body as a string, None if every attempt failed
        """
        def fetch():
            # The with-block closes the connection even when read() raises.
            with urllib.request.urlopen(url) as response:
                # Decode rather than str(bytes), which would yield the
                # "b'...'" repr of the payload instead of its text.
                return response.read().decode("utf-8", errors="replace")
        return self.tryNTimes(fetch, 5)

    def _downloadFile(self, url, fname):
        """
        Downloads url to fname via a temporary ".part" file, retrying up to 10 times.
        :param url: A string of the file address
        :param fname: Name of the local file to create
        :return: True on success, False if every attempt failed
        """
        # Write to a side file so an interrupted transfer never leaves a
        # truncated fname behind that a later run would treat as complete.
        part_name = fname + ".part"
        def attempt():
            # A fresh Reporter per attempt so progress restarts from zero.
            return urllib.request.urlretrieve(url, part_name, Reporter().reportHook)
        if self.tryNTimes(attempt, 10) is None:
            print("Failed to download %s"%url)
            if os.path.isfile(part_name):
                os.remove(part_name)
            return False
        os.replace(part_name, fname)
        return True

    def scrapeEpisodePage(self, page_url):
        """
        Downloads every mp3 referenced on an episode page to the local directory
        :param page_url: A string of a podbay.fm episode page, eg. http://podbay.fm/show/354668519/e/1442458800?autostart=1
        :return: None
        """
        payload = self._fetchPage(page_url)
        if payload is None:
            print("Failed to scrape %s"%page_url)
            return
        # dict.fromkeys drops repeated links while keeping page order.
        for url in dict.fromkeys(self.mp3_matcher.findall(payload)):
            # [-1] also copes with a reference that contains no '/' at all.
            fname = url.rsplit('/',1)[-1]
            if os.path.isfile(fname):
                print("Already downloaded:", fname)
                continue
            print("Downloading", url)
            self._downloadFile(url, fname)
            print()

    def scrapeShowPage(self, target_url):
        """
        Downloads all a show's episodes to the local directory
        :param target_url: A string of a podbay.fm podcast url, eg http://podbay.fm/show/354668519
        :return: None
        """
        print("\nStarting scrape of", target_url)
        payload = self._fetchPage(target_url)
        if payload is None:
            print("Failed to scrape %s"%target_url)
            return
        # Remove duplicate episode links, keeping the order they appear in.
        urls = list(dict.fromkeys(self.episode_matcher.findall(payload)))
        print("Matched", len(urls), "urls")
        for number, url in enumerate(urls, 1):
            print("Episode %d of %d"%(number,len(urls)))
            self.scrapeEpisodePage(url)
        print("\nFinished scraping", target_url)

if __name__=="__main__":
    scraper = PodbayShowScraper()

    ############################## BELOW IS THE LINE YOU SHOULD EDIT ###################################
    default_show_url = "http://podbay.fm/show/216713308"
    ############################## ABOVE IS THE LINE YOU SHOULD EDIT ###################################

    # Show URLs passed on the command line take precedence, so the script can
    # be pointed at other shows without editing it. With no arguments the
    # default above is scraped.
    show_urls = sys.argv[1:] or [default_show_url]
    for show_url in show_urls:
        scraper.scrapeShowPage(show_url)

    # Wait for Enter before exiting.
    input("Press Enter to finish...")