changeset 47:315afeb47e52

myrss: fix handling of embedded HTML tags and special characters; add myrss_test_feed.py
author paulo
date Wed, 13 Feb 2013 00:11:58 -0800
parents aca02ce71274
children 50de1845520f
files myrss/myrss_app.py myrss/myrss_test_feed.py
diffstat 2 files changed, 43 insertions(+), 14 deletions(-) [+]
line diff
     1.1 --- a/myrss/myrss_app.py	Tue Feb 12 00:43:11 2013 -0700
     1.2 +++ b/myrss/myrss_app.py	Wed Feb 13 00:11:58 2013 -0800
     1.3 @@ -6,18 +6,21 @@
     1.4  import Queue
     1.5  import datetime
     1.6  import time
     1.7 +
     1.8  import logging
     1.9  logging.basicConfig(level=logging.INFO)
    1.10  
    1.11 +import xml.etree.ElementTree 
    1.12 +import HTMLParser
    1.13 +
    1.14  import html
    1.15 -import xml.etree.ElementTree 
    1.16  
    1.17  
    1.18  FEEDS_FILE = "FEEDS"
    1.19  CACHE_HTML_FILE = "__cache__.html"
    1.20  
    1.21  CACHE_LIFE = 1200 # [seconds]
    1.22 -MAX_ITEMS = 30
    1.23 +MAX_ITEMS = 50
    1.24  MAX_LINK_Z = 4
    1.25  MAX_THREADS = 20
    1.26  URLOPEN_TIMEOUT = 60 # [seconds]
    1.27 @@ -34,14 +37,18 @@
    1.28  		return re_match.group(2, 3)
    1.29  	
    1.30  
    1.31 +def _strip_if_not_none(txt):
    1.32 +	return txt.strip() if txt is not None else ''
    1.33 +
    1.34 +
    1.35  def _go_rss(elementTree):
    1.36 -	title = elementTree.find("channel/title").text.strip()
    1.37 +	title = _strip_if_not_none(elementTree.find("channel/title").text)
    1.38  	link = elementTree.find("channel/link").text
    1.39  
    1.40  	items = []
    1.41  
    1.42  	for i in elementTree.findall("channel/item")[:MAX_ITEMS]:
    1.43 -		it_title = i.find("title").text.strip()
    1.44 +		it_title = _strip_if_not_none(i.find("title").text)
    1.45  		it_link = i.find("link").text
    1.46  
    1.47  		items.append((it_title, it_link))
    1.48 @@ -52,7 +59,7 @@
    1.49  def _go_atom(elementTree):
    1.50  	ns = "http://www.w3.org/2005/Atom"
    1.51  
    1.52 -	title = elementTree.find("{%s}title" % ns).text.strip()
    1.53 +	title = _strip_if_not_none(elementTree.find("{%s}title" % ns).text)
    1.54  	link = ''
    1.55  
    1.56  	for i in elementTree.findall("{%s}link" % ns):
    1.57 @@ -63,7 +70,7 @@
    1.58  	items = []
    1.59  
    1.60  	for i in elementTree.findall("{%s}entry" % ns)[:MAX_ITEMS]:
    1.61 -		it_title = i.find("{%s}title" % ns).text.strip()
    1.62 +		it_title = _strip_if_not_none(i.find("{%s}title" % ns).text)
    1.63  		it_link = ''
    1.64  		
    1.65  		for j in i.findall("{%s}link" % ns):
    1.66 @@ -76,6 +83,13 @@
    1.67  	return (title, link, items)
    1.68  
    1.69  
    1.70 +_STRIP_HTML_RE = re.compile(r"<.*?>")
    1.71 +_htmlParser = HTMLParser.HTMLParser()
    1.72 +
    1.73 +def _strip_html(txt):
    1.74 +	return _htmlParser.unescape(_STRIP_HTML_RE.sub('', txt))
    1.75 +	
    1.76 +
    1.77  def _to_html(dtnow, docstruct):
    1.78  	datetime_str = dtnow.strftime("%Y-%m-%d %H:%M %Z")
    1.79  	page_title = "myrss -- %s" % datetime_str
    1.80 @@ -97,7 +111,7 @@
    1.81  
    1.82  		(title, link, items) = feed
    1.83  
    1.84 -		body.h2.a(title, href=link, klass="z%d" % (link_z % MAX_LINK_Z))
    1.85 +		body.h2.a(_strip_html(title), href=link, klass="z%d" % (link_z % MAX_LINK_Z))
    1.86  		link_z += 1
    1.87  		p = body.p
    1.88  
    1.89 @@ -105,7 +119,7 @@
    1.90  			if i > 0:
    1.91  				p += " - "
    1.92  
    1.93 -			p.a(it_title, href=it_link, klass="z%d" % (link_z % MAX_LINK_Z))
    1.94 +			p.a(_strip_html(it_title), href=it_link, klass="z%d" % (link_z % MAX_LINK_Z))
    1.95  			link_z += 1
    1.96  
    1.97  	dtdelta = datetime.datetime.now() - dtnow
    1.98 @@ -114,15 +128,19 @@
    1.99  	return unicode(root).encode("utf-8")
   1.100  
   1.101  
   1.102 -def _process_url(url):
   1.103 -	ret = None
   1.104 -
   1.105 +def _fetch_url(url):
   1.106  	try:
   1.107  		logging.info("processing %s" % url)
   1.108  		feed = urllib2.urlopen(urllib2.Request(url, headers={"User-Agent": ''}), timeout=URLOPEN_TIMEOUT)
   1.109  	except urllib2.HTTPError as e:
   1.110  		logging.info("(%s) %s" % (url, e))
   1.111 -		return ret
   1.112 +		return None
   1.113 +
   1.114 +	return feed
   1.115 +
   1.116 +
   1.117 +def _process_feed(feed):
   1.118 +	ret = None
   1.119  
   1.120  	elementTree = xml.etree.ElementTree.parse(feed)
   1.121  	root = elementTree.getroot()
   1.122 @@ -155,7 +173,9 @@
   1.123  			(idx, url) = self._input_queue.get()
   1.124  			docfeed = None
   1.125  			try:
   1.126 -				docfeed = _process_url(url)
   1.127 +				feed = _fetch_url(url)
   1.128 +				if feed is not None:
   1.129 +					docfeed = _process_feed(feed)
   1.130  			except Exception as e:
   1.131  				logging.info("(%s) exception: %s" % (url, e))
   1.132  			self._output_queue.put((idx, docfeed))
   1.133 @@ -208,7 +228,7 @@
   1.134  	def __call__(self, environ, start_response):
   1.135  		response_body = main(self._iq, self._oq, self._main_lock)
   1.136  		response_headers = [
   1.137 -			("Content-Type", "text/html"),
   1.138 +			("Content-Type", "text/html; charset=UTF-8"),
   1.139  			("Content-Length", str(len(response_body))),
   1.140  		]
   1.141  		start_response("200 OK", response_headers)
     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/myrss/myrss_test_feed.py	Wed Feb 13 00:11:58 2013 -0800
     2.3 @@ -0,0 +1,9 @@
     2.4 +import datetime
     2.5 +
     2.6 +import myrss_app
     2.7 +
     2.8 +
     2.9 +x = myrss_app._process_feed(open("rottentomatoes.rss.xml"))
    2.10 +y = myrss_app._to_html(datetime.datetime.now(), [x])
    2.11 +
    2.12 +print y