# HG changeset patch # User paulo # Date 1360743118 28800 # Node ID 315afeb47e52015a4f1c9fdda086fd2ae379f0ec # Parent aca02ce71274497adf4d3b2b32adfd960b8ba1a6 myrss: fix handling embedded HTML tags and special characters; add myrss_test_feed.py diff -r aca02ce71274 -r 315afeb47e52 myrss/myrss_app.py --- a/myrss/myrss_app.py Tue Feb 12 00:43:11 2013 -0700 +++ b/myrss/myrss_app.py Wed Feb 13 00:11:58 2013 -0800 @@ -6,18 +6,21 @@ import Queue import datetime import time + import logging logging.basicConfig(level=logging.INFO) +import xml.etree.ElementTree +import HTMLParser + import html -import xml.etree.ElementTree FEEDS_FILE = "FEEDS" CACHE_HTML_FILE = "__cache__.html" CACHE_LIFE = 1200 # [seconds] -MAX_ITEMS = 30 +MAX_ITEMS = 50 MAX_LINK_Z = 4 MAX_THREADS = 20 URLOPEN_TIMEOUT = 60 # [seconds] @@ -34,14 +37,18 @@ return re_match.group(2, 3) +def _strip_if_not_none(txt): + return txt.strip() if txt is not None else '' + + def _go_rss(elementTree): - title = elementTree.find("channel/title").text.strip() + title = _strip_if_not_none(elementTree.find("channel/title").text) link = elementTree.find("channel/link").text items = [] for i in elementTree.findall("channel/item")[:MAX_ITEMS]: - it_title = i.find("title").text.strip() + it_title = _strip_if_not_none(i.find("title").text) it_link = i.find("link").text items.append((it_title, it_link)) @@ -52,7 +59,7 @@ def _go_atom(elementTree): ns = "http://www.w3.org/2005/Atom" - title = elementTree.find("{%s}title" % ns).text.strip() + title = _strip_if_not_none(elementTree.find("{%s}title" % ns).text) link = '' for i in elementTree.findall("{%s}link" % ns): @@ -63,7 +70,7 @@ items = [] for i in elementTree.findall("{%s}entry" % ns)[:MAX_ITEMS]: - it_title = i.find("{%s}title" % ns).text.strip() + it_title = _strip_if_not_none(i.find("{%s}title" % ns).text) it_link = '' for j in i.findall("{%s}link" % ns): @@ -76,6 +83,13 @@ return (title, link, items) +_STRIP_HTML_RE = re.compile(r"<.*?>") +_htmlParser = HTMLParser.HTMLParser() + +def _strip_html(txt): + return _htmlParser.unescape(_STRIP_HTML_RE.sub('', txt)) + + def _to_html(dtnow, docstruct): datetime_str = dtnow.strftime("%Y-%m-%d %H:%M %Z") page_title = "myrss -- %s" % datetime_str @@ -97,7 +111,7 @@ (title, link, items) = feed - body.h2.a(title, href=link, klass="z%d" % (link_z % MAX_LINK_Z)) + body.h2.a(_strip_html(title), href=link, klass="z%d" % (link_z % MAX_LINK_Z)) link_z += 1 p = body.p @@ -105,7 +119,7 @@ if i > 0: p += " - " - p.a(it_title, href=it_link, klass="z%d" % (link_z % MAX_LINK_Z)) + p.a(_strip_html(it_title), href=it_link, klass="z%d" % (link_z % MAX_LINK_Z)) link_z += 1 dtdelta = datetime.datetime.now() - dtnow @@ -114,15 +128,19 @@ return unicode(root).encode("utf-8") -def _process_url(url): - ret = None - +def _fetch_url(url): try: logging.info("processing %s" % url) feed = urllib2.urlopen(urllib2.Request(url, headers={"User-Agent": ''}), timeout=URLOPEN_TIMEOUT) except urllib2.HTTPError as e: logging.info("(%s) %s" % (url, e)) - return ret + return None + + return feed + + +def _process_feed(feed): + ret = None elementTree = xml.etree.ElementTree.parse(feed) root = elementTree.getroot() @@ -155,7 +173,9 @@ (idx, url) = self._input_queue.get() docfeed = None try: - docfeed = _process_url(url) + feed = _fetch_url(url) + if feed is not None: + docfeed = _process_feed(feed) except Exception as e: logging.info("(%s) exception: %s" % (url, e)) self._output_queue.put((idx, docfeed)) @@ -208,7 +228,7 @@ def __call__(self, environ, start_response): response_body = main(self._iq, self._oq, self._main_lock) response_headers = [ - ("Content-Type", "text/html"), + ("Content-Type", "text/html; charset=UTF-8"), ("Content-Length", str(len(response_body))), ] start_response("200 OK", response_headers) diff -r aca02ce71274 -r 315afeb47e52 myrss/myrss_test_feed.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/myrss/myrss_test_feed.py Wed Feb 13 00:11:58 2013 -0800 @@ -0,0 +1,9 @@ +import datetime + +import myrss_app + + +x = myrss_app._process_feed(open("rottentomatoes.rss.xml")) +y = myrss_app._to_html(datetime.datetime.now(), [x]) + +print y