comparison myrss/myrss_app.py @ 47:315afeb47e52
myrss: fix handling of embedded HTML tags and special characters; add myrss_test_feed.py
author | paulo
date | Wed, 13 Feb 2013 00:11:58 -0800
parents | aca02ce71274
children | 66a232bae83c
comparison (a row with an empty old column is an inserted line; a row with an empty new column is a deleted line)
2:d39f15d9bc72 (old) | 3:f2613d9a3de6 (new)
---|---
4 import urllib2 | 4 import urllib2 |
5 import threading | 5 import threading |
6 import Queue | 6 import Queue |
7 import datetime | 7 import datetime |
8 import time | 8 import time |
| 9 |
9 import logging | 10 import logging |
10 logging.basicConfig(level=logging.INFO) | 11 logging.basicConfig(level=logging.INFO) |
11 | 12 |
| 13 import xml.etree.ElementTree |
| 14 import HTMLParser |
| 15 |
12 import html | 16 import html |
13 import xml.etree.ElementTree | |
14 | 17 |
15 | 18 |
16 FEEDS_FILE = "FEEDS" | 19 FEEDS_FILE = "FEEDS" |
17 CACHE_HTML_FILE = "__cache__.html" | 20 CACHE_HTML_FILE = "__cache__.html" |
18 | 21 |
19 CACHE_LIFE = 1200 # [seconds] | 22 CACHE_LIFE = 1200 # [seconds] |
20 MAX_ITEMS = 30 | 23 MAX_ITEMS = 50 |
21 MAX_LINK_Z = 4 | 24 MAX_LINK_Z = 4 |
22 MAX_THREADS = 20 | 25 MAX_THREADS = 20 |
23 URLOPEN_TIMEOUT = 60 # [seconds] | 26 URLOPEN_TIMEOUT = 60 # [seconds] |
24 | 27 |
25 | 28 |
32 return (None, None) | 35 return (None, None) |
33 else: | 36 else: |
34 return re_match.group(2, 3) | 37 return re_match.group(2, 3) |
35 | 38 |
36 | 39 |
| 40 def _strip_if_not_none(txt): |
| 41 return txt.strip() if txt is not None else '' |
| 42 |
| 43 |
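
The new _strip_if_not_none helper exists because ElementTree sets .text to None for empty elements such as <title/>, so the old title.text.strip() calls could raise AttributeError on sparse feeds. A minimal sketch of the behavior (Python 2, matching the code above; the sample XML is made up):

    import xml.etree.ElementTree

    def _strip_if_not_none(txt):
        return txt.strip() if txt is not None else ''

    # <title/> carries no text node, so .text is None
    root = xml.etree.ElementTree.fromstring("<channel><title/></channel>")
    print repr(_strip_if_not_none(root.find("title").text))  # ''
    print repr(_strip_if_not_none("  Hello \n"))             # 'Hello'
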
37 def _go_rss(elementTree): | 44 def _go_rss(elementTree): |
38 title = elementTree.find("channel/title").text.strip() | 45 title = _strip_if_not_none(elementTree.find("channel/title").text) |
39 link = elementTree.find("channel/link").text | 46 link = elementTree.find("channel/link").text |
40 | 47 |
41 items = [] | 48 items = [] |
42 | 49 |
43 for i in elementTree.findall("channel/item")[:MAX_ITEMS]: | 50 for i in elementTree.findall("channel/item")[:MAX_ITEMS]: |
44 it_title = i.find("title").text.strip() | 51 it_title = _strip_if_not_none(i.find("title").text) |
45 it_link = i.find("link").text | 52 it_link = i.find("link").text |
46 | 53 |
47 items.append((it_title, it_link)) | 54 items.append((it_title, it_link)) |
48 | 55 |
49 return (title, link, items) | 56 return (title, link, items) |
50 | 57 |
51 | 58 |
52 def _go_atom(elementTree): | 59 def _go_atom(elementTree): |
53 ns = "http://www.w3.org/2005/Atom" | 60 ns = "http://www.w3.org/2005/Atom" |
54 | 61 |
55 title = elementTree.find("{%s}title" % ns).text.strip() | 62 title = _strip_if_not_none(elementTree.find("{%s}title" % ns).text) |
56 link = '' | 63 link = '' |
57 | 64 |
58 for i in elementTree.findall("{%s}link" % ns): | 65 for i in elementTree.findall("{%s}link" % ns): |
59 if i.get("type") == "text/html" and i.get("rel") == "alternate": | 66 if i.get("type") == "text/html" and i.get("rel") == "alternate": |
60 link = i.get("href") | 67 link = i.get("href") |
61 break | 68 break |
62 | 69 |
63 items = [] | 70 items = [] |
64 | 71 |
65 for i in elementTree.findall("{%s}entry" % ns)[:MAX_ITEMS]: | 72 for i in elementTree.findall("{%s}entry" % ns)[:MAX_ITEMS]: |
66 it_title = i.find("{%s}title" % ns).text.strip() | 73 it_title = _strip_if_not_none(i.find("{%s}title" % ns).text) |
67 it_link = '' | 74 it_link = '' |
68 | 75 |
69 for j in i.findall("{%s}link" % ns): | 76 for j in i.findall("{%s}link" % ns): |
70 if j.get("type") == "text/html" and j.get("rel") == "alternate": | 77 if j.get("type") == "text/html" and j.get("rel") == "alternate": |
71 it_link = j.get("href") | 78 it_link = j.get("href") |
74 items.append((it_title, it_link)) | 81 items.append((it_title, it_link)) |
75 | 82 |
76 return (title, link, items) | 83 return (title, link, items) |
77 | 84 |
78 | 85 |
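
_go_atom picks the human-readable link by scanning the namespaced <link> elements for rel="alternate" with type="text/html", since Atom feeds typically also carry rel="self" links pointing back at the feed itself. A small standalone sketch (the entry XML is hypothetical):

    import xml.etree.ElementTree

    ns = "http://www.w3.org/2005/Atom"
    entry = xml.etree.ElementTree.fromstring(
        '<entry xmlns="%s">'
        '<link rel="self" type="application/atom+xml" href="http://example.com/feed"/>'
        '<link rel="alternate" type="text/html" href="http://example.com/post"/>'
        '</entry>' % ns)

    for j in entry.findall("{%s}link" % ns):
        if j.get("type") == "text/html" and j.get("rel") == "alternate":
            print j.get("href")  # http://example.com/post
            break
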
| 86 _STRIP_HTML_RE = re.compile(r"<.*?>") |
| 87 _htmlParser = HTMLParser.HTMLParser() |
| 88 |
| 89 def _strip_html(txt): |
| 90 return _htmlParser.unescape(_STRIP_HTML_RE.sub('', txt)) |
| 91 |
| 92 |
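
The new _strip_html pairs a non-greedy regex that removes anything tag-shaped with HTMLParser.unescape (Python 2 stdlib) to decode entities like &amp;, which is what the commit message means by handling embedded HTML tags and special characters in feed titles. A quick illustration with made-up input:

    import re
    import HTMLParser  # Python 2; html.parser in Python 3

    _STRIP_HTML_RE = re.compile(r"<.*?>")
    _htmlParser = HTMLParser.HTMLParser()

    def _strip_html(txt):
        return _htmlParser.unescape(_STRIP_HTML_RE.sub('', txt))

    print _strip_html("Tom &amp; Jerry in <b>4K</b>")  # Tom & Jerry in 4K

Regex-based stripping is a heuristic: fine for short feed titles, but it would mangle text containing a literal '<'.
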
79 def _to_html(dtnow, docstruct): | 93 def _to_html(dtnow, docstruct): |
80 datetime_str = dtnow.strftime("%Y-%m-%d %H:%M %Z") | 94 datetime_str = dtnow.strftime("%Y-%m-%d %H:%M %Z") |
81 page_title = "myrss -- %s" % datetime_str | 95 page_title = "myrss -- %s" % datetime_str |
82 | 96 |
83 root = html.HTML("html") | 97 root = html.HTML("html") |
95 if feed is None: | 109 if feed is None: |
96 continue | 110 continue |
97 | 111 |
98 (title, link, items) = feed | 112 (title, link, items) = feed |
99 | 113 |
100 body.h2.a(title, href=link, klass="z%d" % (link_z % MAX_LINK_Z)) | 114 body.h2.a(_strip_html(title), href=link, klass="z%d" % (link_z % MAX_LINK_Z)) |
101 link_z += 1 | 115 link_z += 1 |
102 p = body.p | 116 p = body.p |
103 | 117 |
104 for (i, (it_title, it_link)) in enumerate(items): | 118 for (i, (it_title, it_link)) in enumerate(items): |
105 if i > 0: | 119 if i > 0: |
106 p += " - " | 120 p += " - " |
107 | 121 |
108 p.a(it_title, href=it_link, klass="z%d" % (link_z % MAX_LINK_Z)) | 122 p.a(_strip_html(it_title), href=it_link, klass="z%d" % (link_z % MAX_LINK_Z)) |
109 link_z += 1 | 123 link_z += 1 |
110 | 124 |
111 dtdelta = datetime.datetime.now() - dtnow | 125 dtdelta = datetime.datetime.now() - dtnow |
112 root.div("%.3f" % (dtdelta.days*86400 + dtdelta.seconds + dtdelta.microseconds/1e6), klass="debug") | 126 root.div("%.3f" % (dtdelta.days*86400 + dtdelta.seconds + dtdelta.microseconds/1e6), klass="debug") |
113 | 127 |
114 return unicode(root).encode("utf-8") | 128 return unicode(root).encode("utf-8") |
115 | 129 |
116 | 130 |
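
_to_html builds the page with the third-party html builder package (the "import html" above), where attribute access creates nested child tags and the klass keyword stands in for the reserved word class. A minimal sketch of that API, assuming this is the PyPI "html" package (the values are made up):

    import html

    root = html.HTML("html")
    body = root.body
    body.h2.a("Feed title", href="http://example.com/", klass="z0")
    print unicode(root)
    # renders markup along the lines of:
    # <html><body><h2><a href="http://example.com/" class="z0">Feed title</a></h2></body></html>
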
117 def _process_url(url): | 131 def _fetch_url(url): |
118 ret = None | |
119 | |
120 try: | 132 try: |
121 logging.info("processing %s" % url) | 133 logging.info("processing %s" % url) |
122 feed = urllib2.urlopen(urllib2.Request(url, headers={"User-Agent": ''}), timeout=URLOPEN_TIMEOUT) | 134 feed = urllib2.urlopen(urllib2.Request(url, headers={"User-Agent": ''}), timeout=URLOPEN_TIMEOUT) |
123 except urllib2.HTTPError as e: | 135 except urllib2.HTTPError as e: |
124 logging.info("(%s) %s" % (url, e)) | 136 logging.info("(%s) %s" % (url, e)) |
125 return ret | 137 return None |
| 138 |
| 139 return feed |
| 140 |
| 141 |
| 142 def _process_feed(feed): |
| 143 ret = None |
126 | 144 |
127 elementTree = xml.etree.ElementTree.parse(feed) | 145 elementTree = xml.etree.ElementTree.parse(feed) |
128 root = elementTree.getroot() | 146 root = elementTree.getroot() |
129 | 147 |
130 parsed_root_tag = _parse_root_tag(root.tag) | 148 parsed_root_tag = _parse_root_tag(root.tag) |
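
Splitting the old _process_url into _fetch_url and _process_feed separates network failures (handled inside _fetch_url, which returns None on HTTPError) from XML parsing, which dispatches on the parsed root tag. The two compose the same way the worker below uses them (the URL here is hypothetical):

    feed = _fetch_url("http://example.com/rss.xml")
    docfeed = _process_feed(feed) if feed is not None else None
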
153 def run(self): | 171 def run(self): |
154 while True: | 172 while True: |
155 (idx, url) = self._input_queue.get() | 173 (idx, url) = self._input_queue.get() |
156 docfeed = None | 174 docfeed = None |
157 try: | 175 try: |
158 docfeed = _process_url(url) | 176 feed = _fetch_url(url) |
| 177 if feed is not None: |
| 178 docfeed = _process_feed(feed) |
159 except Exception as e: | 179 except Exception as e: |
160 logging.info("(%s) exception: %s" % (url, e)) | 180 logging.info("(%s) exception: %s" % (url, e)) |
161 self._output_queue.put((idx, docfeed)) | 181 self._output_queue.put((idx, docfeed)) |
162 | 182 |
163 | 183 |
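
Each WorkerThread loops forever: it pulls an (idx, url) pair from the input queue, fetches and parses, and puts (idx, docfeed) on the output queue, leaving docfeed as None when fetching or parsing failed. The idx lets the consumer reassemble results in submission order even though threads finish in arbitrary order. A sketch of that fan-out/fan-in, assuming WorkerThread instances sharing the two queues are already running (the URLs are made up):

    import Queue  # Python 2 stdlib; renamed queue in Python 3

    input_queue = Queue.Queue()
    output_queue = Queue.Queue()
    # ... start WorkerThread(input_queue=..., output_queue=...) instances here ...

    urls = ["http://example.com/a.xml", "http://example.com/b.xml"]
    for (idx, url) in enumerate(urls):
        input_queue.put((idx, url))

    results = [None] * len(urls)
    for _ in urls:
        (idx, docfeed) = output_queue.get()  # arrives in completion order
        results[idx] = docfeed               # idx restores submission order
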
206 WorkerThread(input_queue=self._iq, output_queue=self._oq).start() | 226 WorkerThread(input_queue=self._iq, output_queue=self._oq).start() |
207 | 227 |
208 def __call__(self, environ, start_response): | 228 def __call__(self, environ, start_response): |
209 response_body = main(self._iq, self._oq, self._main_lock) | 229 response_body = main(self._iq, self._oq, self._main_lock) |
210 response_headers = [ | 230 response_headers = [ |
211 ("Content-Type", "text/html"), | 231 ("Content-Type", "text/html; charset=UTF-8"), |
212 ("Content-Length", str(len(response_body))), | 232 ("Content-Length", str(len(response_body))), |
213 ] | 233 ] |
214 start_response("200 OK", response_headers) | 234 start_response("200 OK", response_headers) |
215 | 235 |
216 return [response_body] | 236 return [response_body] |
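
Adding charset=UTF-8 to the Content-Type header matters because the body returned by _to_html is UTF-8-encoded bytes (unicode(root).encode("utf-8") above); without the declaration a browser may fall back to Latin-1 and garble non-ASCII titles. Content-Length stays correct because len() is applied to the encoded str, so it counts bytes rather than characters:

    body = u"caf\xe9".encode("utf-8")
    print len(u"caf\xe9"), len(body)  # 4 characters, 5 bytes
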