Mercurial > hg > index.fcgi > www > www-1
comparison myrss2/myrss_app.py @ 108:cffd95813b82
add myrss2
author | paulo |
---|---|
date | Sun, 24 May 2020 00:22:05 -0700 |
parents | |
children | 1a5c0fc5627a |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a1a5d4af7d3e |
---|---|
1 import io | |
2 import os | |
3 import sys | |
4 import re | |
5 import urllib.request | |
6 import urllib.error | |
7 import threading | |
8 import queue | |
9 import datetime | |
10 import time | |
11 import traceback | |
12 | |
13 import logging | |
# Name of the logging level, taken from the LOG_LEVEL environment variable.
LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")

# Resolve the name case-insensitively. Anything that does not map to one of
# the numeric level constants of the logging module (a typo, or a lowercase
# name such as "debug", which would otherwise resolve to the logging.debug
# function) falls back to INFO instead of failing at import time.
_log_level = getattr(logging, LOG_LEVEL.upper(), None)
logging.basicConfig(
    level=_log_level if isinstance(_log_level, int) else logging.INFO,
    format="%(asctime)s %(levelname)-8s %(message)s",
)
19 | |
20 import xml.etree.ElementTree | |
21 import html | |
22 | |
23 from html3.html3 import HTML | |
24 | |
25 | |
# Path of the text file read by main(); each line that does not start with
# '#' is stripped and queued as a feed URL.
FEEDS_FILE = "FEEDS"
# Path of the file in which main() stores the most recently rendered page.
CACHE_HTML_FILE = "__cache__.html"

# main() serves the cached page while the cache file is younger than this.
CACHE_LIFE = 1200 # [seconds]
# Upper bound on the number of items taken from each feed.
MAX_ITEMS = 50
# Modulus for the running link counter that selects the "z%d" CSS class.
MAX_LINK_Z = 4
# Number of worker threads started by MyRssApp; also the size of both queues.
MAX_THREADS = 20
# Timeout passed to urllib.request.urlopen() for every feed download.
URLOPEN_TIMEOUT = 10 # [seconds]
34 | |
35 | |
# Optional "{namespace}" prefix (group 2 is the namespace without braces)
# followed by the local tag name (group 3).
_PARSE_ROOT_TAG_RE = re.compile(r"(\{(.+)\})?(.+)")


def _parse_root_tag(root_tag):
    """Split an ElementTree tag into a ``(namespace, local_name)`` pair.

    The namespace is None when the tag carries no ``{...}`` prefix.
    ``(None, None)`` is returned when the tag does not match at all
    (for example the empty string).
    """
    found = _PARSE_ROOT_TAG_RE.match(root_tag)
    if found is None:
        return (None, None)

    # groups() is (braced_prefix, namespace, local_name); drop the first.
    return found.groups()[1:]
45 | |
46 | |
def _strip_if_not_none(txt):
    """Return *txt* without surrounding whitespace, or '' when it is None."""
    if txt is None:
        return ''
    return txt.strip()
49 | |
50 | |
def _go_rss(elementTree):
    """Extract the channel title, channel link and items of an RSS feed.

    Returns ``(title, link, items)`` where *items* is a list of
    ``(item_title, item_link)`` tuples holding at most MAX_ITEMS entries.
    Titles are stripped and are '' when the element is absent or empty;
    links are None when the element is absent or empty.
    """
    def _text(parent, path):
        # find() returns None for an absent element; report that as "no
        # text" instead of failing on ``None.text`` and losing the feed.
        node = parent.find(path)
        return node.text if node is not None else None

    title = _strip_if_not_none(_text(elementTree, "channel/title"))
    link = _text(elementTree, "channel/link")

    items = [
        (_strip_if_not_none(_text(item, "title")), _text(item, "link"))
        for item in elementTree.findall("channel/item")[:MAX_ITEMS]
    ]

    return (title, link, items)
64 | |
65 | |
def _go_atom(elementTree):
    """Extract the feed title, feed link and entries of an Atom feed.

    Returns ``(title, link, items)`` where *items* is a list of
    ``(entry_title, entry_link)`` tuples holding at most MAX_ITEMS entries.
    Titles are stripped and are '' when the element is absent or empty.
    A link is '' when no suitable ``<link>`` element exists, and None when
    the selected element has no ``href`` attribute.
    """
    ns = "http://www.w3.org/2005/Atom"
    title_path = "{%s}title" % ns
    link_path = "{%s}link" % ns

    def _title_of(node):
        # A missing <title> yields '' rather than an AttributeError.
        found = node.find(title_path)
        return _strip_if_not_none(found.text if found is not None else None)

    def _link_of(node):
        # Take the only link if there is just one, otherwise the first
        # "alternate" link. A link without a rel attribute counts as
        # rel="alternate" (RFC 4287, section 4.2.7.2).
        candidates = node.findall(link_path)
        for candidate in candidates:
            if len(candidates) == 1 or candidate.get("rel", "alternate") == "alternate":
                return candidate.get("href")
        return ''

    title = _title_of(elementTree)
    link = _link_of(elementTree)

    items = [
        (_title_of(entry), _link_of(entry))
        for entry in elementTree.findall("{%s}entry" % ns)[:MAX_ITEMS]
    ]

    return (title, link, items)
93 | |
94 | |
def _go_purl_rss(elementTree):
    """Extract the channel title, channel link and items of an RSS 1.0 feed.

    Elements are looked up in the ``http://purl.org/rss/1.0/`` namespace;
    items are searched as direct children of the root, not of the channel.

    Returns ``(title, link, items)`` where *items* is a list of
    ``(item_title, item_link)`` tuples holding at most MAX_ITEMS entries.
    Titles are stripped and are '' when the element is absent or empty;
    links are None when the element is absent or empty.
    """
    ns = "http://purl.org/rss/1.0/"

    def _text(parent, path):
        # find() returns None for an absent element; report that as "no
        # text" instead of failing on ``None.text`` and losing the feed.
        node = parent.find(path)
        return node.text if node is not None else None

    title = _strip_if_not_none(_text(elementTree, "{%s}channel/{%s}title" % (ns, ns)))
    link = _text(elementTree, "{%s}channel/{%s}link" % (ns, ns))

    items = [
        (_strip_if_not_none(_text(item, "{%s}title" % ns)), _text(item, "{%s}link" % ns))
        for item in elementTree.findall("{%s}item" % ns)[:MAX_ITEMS]
    ]

    return (title, link, items)
110 | |
111 | |
# Shortest "<...>" run. DOTALL lets a tag that is wrapped over several lines
# (e.g. a newline between the tag name and its attributes) match as well.
_STRIP_HTML_RE = re.compile(r"<.*?>", re.DOTALL)


def _strip_html(txt):
    """Return *txt* with markup tags removed and HTML entities decoded.

    Tags are removed first and entities decoded afterwards, so an escaped
    ``&lt;b&gt;`` survives as the literal text ``<b>``.
    """
    return html.unescape(_STRIP_HTML_RE.sub('', txt))
116 | |
117 | |
def _to_html(dtnow, docstruct):
    """Render the collected feeds as one HTML document and return it as str.

    dtnow     -- datetime shown in the page title; also the start point of
                 the elapsed time printed in the trailing "debug" div.
    docstruct -- iterable of ``(title, link, items)`` tuples, where items
                 is a sequence of ``(item_title, item_link)`` pairs.
                 Entries that are None are skipped.
    """
    page_title = "myrss -- %s" % dtnow.strftime("%Y-%m-%d %H:%M %Z")

    root = HTML("html")

    head = root.head
    head.meta(name="viewport", content="width=device-width, initial-scale=1")
    head.title(page_title)
    head.link(rel="stylesheet", type="text/css", href="static/index.css")

    body = root.body
    body.h1(page_title)

    # Running counter over every emitted heading and item; taken modulo
    # MAX_LINK_Z it selects the "z<n>" CSS class.
    link_z = 0

    for (title, link, items) in (entry for entry in docstruct if entry is not None):
        logging.debug("title: %s", title)
        body.h2.a(_strip_html(title), href=link, klass="z%d" % (link_z % MAX_LINK_Z))
        link_z += 1
        paragraph = body.p

        for position, (it_title, it_link) in enumerate(items):
            # Separator between consecutive items, not before the first.
            if position:
                paragraph += " - "

            label = _strip_html(it_title or "(missing title)")
            if it_link is None:
                # No link available: emit the title as plain text.
                paragraph += label
            else:
                paragraph.a(label, href=it_link, klass="z%d" % (link_z % MAX_LINK_Z))

            link_z += 1

    # Seconds elapsed since dtnow, printed with millisecond precision.
    elapsed = datetime.datetime.now() - dtnow
    root.div("%.3f" % (elapsed.days*86400 + elapsed.seconds + elapsed.microseconds/1e6), klass="debug")

    return str(root)
162 | |
163 | |
def _fetch_url(url):
    """Download *url* and return the response body decoded as UTF-8 text.

    Returns None (after logging) when the server answers with an HTTP
    error status. Any other failure -- an unreachable host, a timeout
    after URLOPEN_TIMEOUT seconds, a body that is not valid UTF-8 --
    propagates to the caller.
    """
    logging.info("processing %s", url)
    request = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0 Browser"})

    try:
        # The with-statement closes the response (and its connection) once
        # the body has been read, including when read() or decoding fails.
        with urllib.request.urlopen(request, timeout=URLOPEN_TIMEOUT) as response:
            return str(response.read(), encoding="utf-8")
    except urllib.error.HTTPError as e:
        logging.info("(%s) %s", url, e)
        return None
173 | |
174 | |
def _filter_feed(feed):
    """Return *feed* with every occurrence of the unwanted substrings removed.

    The only substring currently filtered is the control character "\\x16".
    """
    cleaned = feed

    for unwanted in ("\x16",):
        # Splitting on the substring and re-joining drops every occurrence.
        cleaned = "".join(cleaned.split(unwanted))

    return cleaned
183 | |
184 | |
def _process_feed(feed):
    """Parse the XML text *feed* and extract ``(title, link, items)``.

    The extractor is chosen from the root element:
      * ``rss`` without namespace and version >= 2.0 -> _go_rss
      * Atom ``feed``                                -> _go_atom
      * RDF ``RDF``                                  -> _go_purl_rss

    Raises NotImplementedError for an rss version below 2.0 (a missing
    version attribute counts as 0.0) and for any other root element.
    Errors raised by the XML parser or by float() on the version
    attribute are not caught here.
    """
    tree = xml.etree.ElementTree.parse(io.StringIO(feed))
    root = tree.getroot()
    root_kind = _parse_root_tag(root.tag)

    if root_kind == (None, "rss"):
        if not float(root.get("version", 0.0)) >= 2.0:
            raise NotImplementedError("Unsupported rss version")
        return _go_rss(tree)

    if root_kind == ("http://www.w3.org/2005/Atom", "feed"):
        return _go_atom(tree)

    if root_kind == ("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "RDF"):
        return _go_purl_rss(tree)

    raise NotImplementedError("Unknown root tag")
208 | |
209 | |
class WorkerThread(threading.Thread):
    """Daemon thread that downloads and parses feeds taken from a queue.

    Takes the mandatory keyword arguments ``input_queue`` and
    ``output_queue``; everything else is forwarded to threading.Thread.
    For every ``(idx, url)`` read from the input queue exactly one
    ``(idx, docfeed)`` is written to the output queue, where docfeed is
    None when the feed could not be fetched or parsed.
    """

    def __init__(self, *args, **kwargs):
        source = kwargs.pop("input_queue")
        sink = kwargs.pop("output_queue")
        super().__init__(*args, **kwargs)
        self._input_queue = source
        self._output_queue = sink
        self.daemon = True

    @staticmethod
    def _load(url):
        """Fetch and parse one feed; return None (after logging) on failure."""
        try:
            raw = _fetch_url(url)
            if raw is None:
                return None
            return _process_feed(_filter_feed(raw))
        except Exception as exc:
            logging.info("(%s) exception: (%s) %s" % (url, type(exc), exc))
            return None

    def run(self):
        # Serve jobs forever; the thread is a daemon, so it does not keep
        # the interpreter alive on exit.
        while True:
            idx, url = self._input_queue.get()
            self._output_queue.put((idx, self._load(url)))
228 | |
229 | |
def main(input_queue, output_queue, lock):
    """Return the aggregated HTML page, rebuilding it when the cache is stale.

    input_queue  -- queue on which ``(index, url)`` jobs are placed for the
                    worker threads.
    output_queue -- queue from which ``(index, docfeed)`` results are read.
    lock         -- lock held for the whole call, so that only one caller
                    at a time uses the queues and the cache file.

    When CACHE_HTML_FILE exists and is younger than CACHE_LIFE seconds its
    content is returned as is. Otherwise every URL listed in FEEDS_FILE is
    dispatched to the workers, the results are rendered with _to_html(),
    and the page is written to CACHE_HTML_FILE before being returned.
    """
    ret = ''

    with lock:
        logging.debug("main() started")
        epoch_now = time.time()
        dtnow = datetime.datetime.fromtimestamp(epoch_now)

        if os.path.exists(CACHE_HTML_FILE) and (epoch_now - os.stat(CACHE_HTML_FILE).st_mtime) < float(CACHE_LIFE):
            # The page is served as UTF-8, so the cache uses UTF-8 too
            # instead of whatever the process locale happens to be.
            with open(CACHE_HTML_FILE, encoding="utf-8") as cache_html_file:
                ret = cache_html_file.read()

        else:
            with open(FEEDS_FILE) as feeds_file:
                feedlines = feeds_file.readlines()

            # One slot per line of the file keeps the output in file order;
            # slots of comment and blank lines stay None.
            docstruct = [None]*len(feedlines)

            jobs = []
            for (i, line) in enumerate(feedlines):
                url = line.strip()
                # Skip blank lines and comments rather than queueing an
                # empty URL that can only fail in the worker.
                if url and not url.startswith('#'):
                    jobs.append((i, url))

            # Interleave submitting jobs with collecting results. Putting
            # every job first would block forever on a bounded input queue
            # once the workers are themselves blocked on a full output
            # queue that nobody is draining.
            submitted = 0
            received = 0
            while received < len(jobs):
                while submitted < len(jobs):
                    try:
                        input_queue.put_nowait(jobs[submitted])
                    except queue.Full:
                        break
                    submitted += 1

                (idx, docfeed) = output_queue.get()
                docstruct[idx] = docfeed
                received += 1

            ret = _to_html(dtnow, docstruct)

            with open(CACHE_HTML_FILE, 'w', encoding="utf-8") as cache_html_file:
                cache_html_file.write(ret)
        logging.debug("main() ended")

    return ret
265 | |
266 | |
class MyRssApp:
    """WSGI application that serves the aggregated feed page.

    Creating an instance creates the job and result queues (each bounded to
    MAX_THREADS entries) and the lock handed to main(), and starts
    MAX_THREADS WorkerThread instances.
    """

    def __init__(self):
        logging.debug("MyRssApp.__init__() called")
        self._iq = queue.Queue(MAX_THREADS)
        self._oq = queue.Queue(MAX_THREADS)
        self._main_lock = threading.Lock()

        for i in range(MAX_THREADS):
            logging.debug("Starting thread: %d" % i)
            WorkerThread(input_queue=self._iq, output_queue=self._oq).start()

    # Raw WSGI
    def __call__(self, environ, start_response):
        """Handle one WSGI request.

        Responds "200 OK" with the HTML page, or "500 Internal Server
        Error" with the traceback as plain text when building the page
        raised an exception. Returns a one-element list holding the
        UTF-8 encoded body.
        """
        try:
            response_text = self.call()
        except Exception:
            # Only ordinary errors become a 500 page; SystemExit and
            # KeyboardInterrupt are left to propagate.
            response_code = "500 Internal Server Error"
            response_type = "text/plain; charset=UTF-8"
            response_text = traceback.format_exc()
        else:
            response_code = "200 OK"
            response_type = "text/html; charset=UTF-8"

        # Content-Length counts bytes, so measure the encoded body: with
        # non-ASCII text the character count is smaller than the byte count.
        response_body = response_text.encode("utf-8")

        response_headers = [
            ("Content-Type", response_type),
            ("Content-Length", str(len(response_body))),
        ]
        start_response(response_code, response_headers)

        return [response_body]

    def call(self):
        """Build (or fetch from cache) the page and return it as a str."""
        return main(self._iq, self._oq, self._main_lock)