Mercurial > hg > index.fcgi > www > www-1
changeset 47:315afeb47e52
myrss: fix handling of embedded HTML tags and special characters; add myrss_test_feed.py
author | paulo |
---|---|
date | Wed, 13 Feb 2013 00:11:58 -0800 |
parents | aca02ce71274 |
children | 50de1845520f |
files | myrss/myrss_app.py myrss/myrss_test_feed.py |
diffstat | 2 files changed, 43 insertions(+), 14 deletions(-) [+] |
line diff
1.1 --- a/myrss/myrss_app.py Tue Feb 12 00:43:11 2013 -0700 1.2 +++ b/myrss/myrss_app.py Wed Feb 13 00:11:58 2013 -0800 1.3 @@ -6,18 +6,21 @@ 1.4 import Queue 1.5 import datetime 1.6 import time 1.7 + 1.8 import logging 1.9 logging.basicConfig(level=logging.INFO) 1.10 1.11 +import xml.etree.ElementTree 1.12 +import HTMLParser 1.13 + 1.14 import html 1.15 -import xml.etree.ElementTree 1.16 1.17 1.18 FEEDS_FILE = "FEEDS" 1.19 CACHE_HTML_FILE = "__cache__.html" 1.20 1.21 CACHE_LIFE = 1200 # [seconds] 1.22 -MAX_ITEMS = 30 1.23 +MAX_ITEMS = 50 1.24 MAX_LINK_Z = 4 1.25 MAX_THREADS = 20 1.26 URLOPEN_TIMEOUT = 60 # [seconds] 1.27 @@ -34,14 +37,18 @@ 1.28 return re_match.group(2, 3) 1.29 1.30 1.31 +def _strip_if_not_none(txt): 1.32 + return txt.strip() if txt is not None else '' 1.33 + 1.34 + 1.35 def _go_rss(elementTree): 1.36 - title = elementTree.find("channel/title").text.strip() 1.37 + title = _strip_if_not_none(elementTree.find("channel/title").text) 1.38 link = elementTree.find("channel/link").text 1.39 1.40 items = [] 1.41 1.42 for i in elementTree.findall("channel/item")[:MAX_ITEMS]: 1.43 - it_title = i.find("title").text.strip() 1.44 + it_title = _strip_if_not_none(i.find("title").text) 1.45 it_link = i.find("link").text 1.46 1.47 items.append((it_title, it_link)) 1.48 @@ -52,7 +59,7 @@ 1.49 def _go_atom(elementTree): 1.50 ns = "http://www.w3.org/2005/Atom" 1.51 1.52 - title = elementTree.find("{%s}title" % ns).text.strip() 1.53 + title = _strip_if_not_none(elementTree.find("{%s}title" % ns).text) 1.54 link = '' 1.55 1.56 for i in elementTree.findall("{%s}link" % ns): 1.57 @@ -63,7 +70,7 @@ 1.58 items = [] 1.59 1.60 for i in elementTree.findall("{%s}entry" % ns)[:MAX_ITEMS]: 1.61 - it_title = i.find("{%s}title" % ns).text.strip() 1.62 + it_title = _strip_if_not_none(i.find("{%s}title" % ns).text) 1.63 it_link = '' 1.64 1.65 for j in i.findall("{%s}link" % ns): 1.66 @@ -76,6 +83,13 @@ 1.67 return (title, link, items) 1.68 1.69 1.70 +_STRIP_HTML_RE = 
re.compile(r"<.*?>") 1.71 +_htmlParser = HTMLParser.HTMLParser() 1.72 + 1.73 +def _strip_html(txt): 1.74 + return _htmlParser.unescape(_STRIP_HTML_RE.sub('', txt)) 1.75 + 1.76 + 1.77 def _to_html(dtnow, docstruct): 1.78 datetime_str = dtnow.strftime("%Y-%m-%d %H:%M %Z") 1.79 page_title = "myrss -- %s" % datetime_str 1.80 @@ -97,7 +111,7 @@ 1.81 1.82 (title, link, items) = feed 1.83 1.84 - body.h2.a(title, href=link, klass="z%d" % (link_z % MAX_LINK_Z)) 1.85 + body.h2.a(_strip_html(title), href=link, klass="z%d" % (link_z % MAX_LINK_Z)) 1.86 link_z += 1 1.87 p = body.p 1.88 1.89 @@ -105,7 +119,7 @@ 1.90 if i > 0: 1.91 p += " - " 1.92 1.93 - p.a(it_title, href=it_link, klass="z%d" % (link_z % MAX_LINK_Z)) 1.94 + p.a(_strip_html(it_title), href=it_link, klass="z%d" % (link_z % MAX_LINK_Z)) 1.95 link_z += 1 1.96 1.97 dtdelta = datetime.datetime.now() - dtnow 1.98 @@ -114,15 +128,19 @@ 1.99 return unicode(root).encode("utf-8") 1.100 1.101 1.102 -def _process_url(url): 1.103 - ret = None 1.104 - 1.105 +def _fetch_url(url): 1.106 try: 1.107 logging.info("processing %s" % url) 1.108 feed = urllib2.urlopen(urllib2.Request(url, headers={"User-Agent": ''}), timeout=URLOPEN_TIMEOUT) 1.109 except urllib2.HTTPError as e: 1.110 logging.info("(%s) %s" % (url, e)) 1.111 - return ret 1.112 + return None 1.113 + 1.114 + return feed 1.115 + 1.116 + 1.117 +def _process_feed(feed): 1.118 + ret = None 1.119 1.120 elementTree = xml.etree.ElementTree.parse(feed) 1.121 root = elementTree.getroot() 1.122 @@ -155,7 +173,9 @@ 1.123 (idx, url) = self._input_queue.get() 1.124 docfeed = None 1.125 try: 1.126 - docfeed = _process_url(url) 1.127 + feed = _fetch_url(url) 1.128 + if feed is not None: 1.129 + docfeed = _process_feed(feed) 1.130 except Exception as e: 1.131 logging.info("(%s) exception: %s" % (url, e)) 1.132 self._output_queue.put((idx, docfeed)) 1.133 @@ -208,7 +228,7 @@ 1.134 def __call__(self, environ, start_response): 1.135 response_body = main(self._iq, self._oq, self._main_lock) 
1.136 response_headers = [ 1.137 - ("Content-Type", "text/html"), 1.138 + ("Content-Type", "text/html; charset=UTF-8"), 1.139 ("Content-Length", str(len(response_body))), 1.140 ] 1.141 start_response("200 OK", response_headers)
2.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 2.2 +++ b/myrss/myrss_test_feed.py Wed Feb 13 00:11:58 2013 -0800 2.3 @@ -0,0 +1,9 @@ 2.4 +import datetime 2.5 + 2.6 +import myrss_app 2.7 + 2.8 + 2.9 +x = myrss_app._process_feed(open("rottentomatoes.rss.xml")) 2.10 +y = myrss_app._to_html(datetime.datetime.now(), [x]) 2.11 + 2.12 +print y