newsirc/RSS.py at master · fbxdevs/newsirc · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# -*- coding: utf-8 -*-

"""
RSS Class for grabbing, formatting and outputting information from streams

Note - Only 6 feeds can be scraped at a time
Note - added # -*- coding: utf-8 -*- due to entries using that encoding
"""

import time
import os
import logging
import re
from html.parser import HTMLParser
import feedparser
import random


class MLStripper(HTMLParser):
	"""
	HTML parser that throws away markup and keeps only the text content.

	Every text chunk the parser reports through handle_data() is appended
	to ``self.fed``; get_data() joins the chunks collected so far.
	"""
	def __init__(self):
		"""
		Set up the parser with character references converted to text
		"""
		# Let HTMLParser initialise all of its own state (it calls reset()
		# itself) rather than depending on which attributes reset() sets.
		super().__init__(convert_charrefs=True)
		# legacy attribute kept so existing readers of it still work
		self.strict = False
		# text chunks collected so far, in document order
		self.fed = []

	def handle_data(self, d):
		"""
		Parser callback - remember a chunk of text found between tags
		"""
		self.fed.append(d)

	def get_data(self):
		"""
		Return all text collected so far as one string
		"""
		return ''.join(self.fed)

def strip_tags(html):
	"""
	Strip out HTML from text

	Returns the text content of ``html`` with the markup removed. If the
	parser raises, the error is logged and ``html`` is returned unchanged.
	"""
	s = MLStripper()
	try:
		s.feed(html)
		# close() flushes text the parser is still holding back (for example
		# a trailing "AT&T", which could be the start of a character
		# reference); without it that text is silently dropped.
		s.close()
		return s.get_data()
	except Exception as err:
		# MAIN_LOGGER is not defined anywhere in this file, so using it here
		# would raise NameError; log through the module's named logger.
		logging.getLogger('newsirc.RSS').error(
			'Stripping HTML failed: %s', err)
		return html

class RSS:
	"""
	RSS Class
	"""
	def __init__(self, rssurls):
		"""
		Constructor method
		"""
		# rss feed url
		self.rss_urls = rssurls

		# six randomly selected urls
		self.rssurls_rand = []

		# will hold news item entries
		self.news_entries = []

		self.feeds = [] # will hold current rss feed data

		# class logging setup
		self.loggingDIR = '{}/logs/'.format(
			os.path.dirname(os.path.realpath(__file__)))

		self.logger = logging.getLogger('newsirc.RSS')
		logFileHandler = logging.FileHandler(
			time.strftime(
				"{}RSS%m%d%Y.log".format(self.loggingDIR))
		)
		logFormat = logging.Formatter(
			"%(asctime)s - %(name)s - %(levelname)s - %(message)s")
		logFileHandler.setFormatter(logFormat)
		self.logger.addHandler(logFileHandler)

	def getNews(self, url=None):
		"""
		Used as a worker - grabs information from news rss links
		via feedparser
		"""

		print("getNews URL: {}".format(url))

		try:
			fpobj = feedparser.parse(url)
			self.news_entries = fpobj['entries']

			# shuffle entries
			self.news_entries = sorted(
				self.news_entries,
				key=lambda k: random.random()
			)
		except Exception as err:
			self.logger.error('Feedparsing failed: %s', err)


	def readFeed(self):
		"""
		Get RSS feed information
		"""
		for rssurl in self.rss_urls:
			self.getNews(rssurl)

	def printArticle(self):
		if self.news_entries:
			self.logger.info('Sending single article')
			print('News entries total {}'.format(len(self.news_entries)))
			entry = self.news_entries[0]
			try:
				newsDesc = strip_tags(entry['description']).strip()

				# if description has multiple breaks don't show it
				if len(newsDesc.split('\n')) > 1:
					newsLineOne = "\002{}".format(entry['title'].strip())
					newsLineTwo = None
					newsLineThree = "\037{}".format(entry['link'].strip())
				# check to see if description is empty or not
				elif bool(re.sub(r"\s+", "", newsDesc, flags=re.UNICODE)):
					newsLineOne = "\002{}".format(entry['title'].strip())
					newsLineTwo = "\035{}".format(newsDesc)
					newsLineThree = "\037{}".format(entry['link'].strip())
				else:
					newsLineOne = "\002{}".format(entry['title'].strip())
					newsLineTwo = None
					newsLineThree = "\037{}".format(entry['link'].strip())

				if newsLineTwo:
					newsline = [
						newsLineOne,
						newsLineTwo,
						newsLineThree
					]
				else:
					newsline = [
						newsLineOne,
						newsLineThree
					]
				self.logger.info('Sending %s\n\n', newsline)

				self.news_entries.pop(0)

				return newsline
			except KeyError as err:
				self.logger.error('KeyError with entry %s - %s', entry, err)
		else:
			self.logger.info('Article list is empty - refeeding')
			self.readFeed()
			return self.printArticle()