Mercurial > hg > index.fcgi > www > www-1
diff myrss/myrss_app.py @ 43:df6a1a347584
rename myrss_parser.py to myrss_app.py
author | paulo |
---|---|
date | Tue, 05 Feb 2013 00:01:49 -0800 |
parents | myrss/myrss_parser.py@a1456ecd25b9 |
children | c673e9e9c4ca |
line diff
"""myrss_app -- a small WSGI application that aggregates RSS/Atom feeds.

Feed URLs are read from FEEDS_FILE (one per line; blank lines and lines
starting with '#' are ignored).  Feeds are fetched concurrently by a fixed
pool of worker threads, rendered to a single HTML page, and the result is
cached on disk for CACHE_LIFE seconds.

NOTE: Python 2 code (urllib2, Queue, unicode); the `html` import is the
third-party HTML-builder library, not the stdlib `html` module.
"""
import os
import sys
import re
import urllib2
import threading
import Queue
import datetime
import time
import logging
logging.basicConfig(level=logging.INFO)

import html
import xml.etree.ElementTree


FEEDS_FILE = "FEEDS"
CACHE_HTML_FILE = "__cache__.html"

#CACHE_LIFE = 1200 # [seconds]
CACHE_LIFE = 30 # [seconds]
MAX_ITEMS = 30       # max entries rendered per feed
MAX_LINK_Z = 4       # number of CSS "z" classes cycled across links
MAX_THREADS = 20     # size of the fetcher thread pool


# ElementTree spells namespaced tags as "{namespace}tag"; group 2 is the
# namespace (or None), group 3 the local tag name.
_PARSE_ROOT_TAG_RE = re.compile(r"(\{(.+)\})?(.+)")


def _parse_root_tag(root_tag):
    """Split an ElementTree root tag into (namespace, local_name).

    Returns (None, local_name) for an un-namespaced tag and
    (None, None) if the tag cannot be parsed at all.
    """
    re_match = _PARSE_ROOT_TAG_RE.match(root_tag)

    if re_match is None:
        return (None, None)
    else:
        return re_match.group(2, 3)


def _go_rss(elementTree):
    """Extract (title, link, items) from an RSS 2.x document.

    `items` is a list of (item_title, item_link) tuples, truncated to
    MAX_ITEMS.  Raises AttributeError if mandatory elements (channel
    title/link, item title/link) are missing; the caller logs and skips
    the feed in that case.
    """
    title = elementTree.find("channel/title").text.strip()
    link = elementTree.find("channel/link").text

    items = []

    for i in elementTree.findall("channel/item")[:MAX_ITEMS]:
        it_title = i.find("title").text.strip()
        it_link = i.find("link").text

        items.append((it_title, it_link))

    return (title, link, items)


def _go_atom(elementTree):
    """Extract (title, link, items) from an Atom 1.0 document.

    Mirrors _go_rss(); the feed/entry link chosen is the first
    rel="alternate" link of type "text/html", or '' if none exists.
    """
    ns = "http://www.w3.org/2005/Atom"

    title = elementTree.find("{%s}title" % ns).text.strip()
    link = ''

    for i in elementTree.findall("{%s}link" % ns):
        if i.get("type") == "text/html" and i.get("rel") == "alternate":
            link = i.get("href")
            break

    items = []

    for i in elementTree.findall("{%s}entry" % ns)[:MAX_ITEMS]:
        it_title = i.find("{%s}title" % ns).text.strip()
        it_link = ''

        for j in i.findall("{%s}link" % ns):
            if j.get("type") == "text/html" and j.get("rel") == "alternate":
                it_link = j.get("href")
                break

        items.append((it_title, it_link))

    return (title, link, items)


def _to_html(dtnow, docstruct):
    """Render the list of parsed feeds to a UTF-8 encoded HTML page.

    :param dtnow: datetime used in the page title.
    :param docstruct: list of (title, link, items) tuples or None
        placeholders (failed feeds), as produced by main().
    :return: the complete page as a UTF-8 byte string.
    """
    datetime_str = dtnow.strftime("%Y-%m-%d %H:%M %Z")
    page_title = "myrss -- %s" % datetime_str

    root = html.HTML("html")

    header = root.header
    header.title(page_title)
    header.link(rel="stylesheet", type="text/css", href="index.css")

    body = root.body
    body.h1(page_title)

    # link_z cycles 0..MAX_LINK_Z-1 to give successive links distinct
    # CSS classes (z0, z1, ...).
    link_z = 0

    for feed in docstruct:
        if feed is None:
            # Feed failed to download/parse; skip silently.
            continue

        (title, link, items) = feed

        body.h2.a(title, href=link, klass="z%d" % (link_z % MAX_LINK_Z))
        link_z += 1
        p = body.p

        for (i, (it_title, it_link)) in enumerate(items):
            if i > 0:
                p += " - "

            p.a(it_title, href=it_link, klass="z%d" % (link_z % MAX_LINK_Z))
            link_z += 1

    return unicode(root).encode("utf-8")


def _process_url(url):
    """Fetch and parse one feed URL.

    :return: (title, link, items) on success, None on download failure.
    :raises NotImplementedError: for unsupported feed formats.

    FIX: catch urllib2.URLError (the superclass of HTTPError) so that
    DNS/connection failures are logged and yield None like HTTP errors,
    instead of escaping to the worker's generic exception handler.
    """
    ret = None

    try:
        logging.info("processing %s" % url)
        # Empty User-Agent: some servers reject urllib2's default one.
        feed = urllib2.urlopen(urllib2.Request(url, headers={"User-Agent": ''}))
    except urllib2.URLError as e:
        logging.info("(%s) %s" % (url, e))
        return ret

    elementTree = xml.etree.ElementTree.parse(feed)
    root = elementTree.getroot()

    parsed_root_tag = _parse_root_tag(root.tag)

    if parsed_root_tag == (None, "rss"):
        version = float(root.get("version", 0.0))
        if version >= 2.0:
            ret = _go_rss(elementTree)
        else:
            raise NotImplementedError("Unsupported rss version")
    elif parsed_root_tag == ("http://www.w3.org/2005/Atom", "feed"):
        ret = _go_atom(elementTree)
    else:
        raise NotImplementedError("Unknown root tag")

    return ret


class WorkerThread(threading.Thread):
    """Daemon thread that fetches feeds forever.

    Reads (index, url) pairs from `input_queue`, processes each with
    _process_url(), and writes (index, result_or_None) to `output_queue`.
    """

    def __init__(self, *args, **kwargs):
        self._input_queue = kwargs.pop("input_queue")
        self._output_queue = kwargs.pop("output_queue")
        threading.Thread.__init__(self, *args, **kwargs)
        self.daemon = True  # do not block interpreter shutdown

    def run(self):
        while True:
            (idx, url) = self._input_queue.get()
            docfeed = None
            try:
                docfeed = _process_url(url)
            except Exception as e:
                # A bad feed must not kill the worker; report it as None.
                logging.info("(%s) exception: %s" % (url, e))
            self._output_queue.put((idx, docfeed))


def main(input_queue, output_queue):
    """Produce the aggregated HTML page, using the on-disk cache.

    If CACHE_HTML_FILE is younger than CACHE_LIFE seconds its contents
    are returned as-is; otherwise all feeds are re-fetched through the
    worker pool, re-rendered, and the cache is rewritten.

    :return: the page as a UTF-8 byte string.
    """
    ret = ''

    epoch_now = time.time()
    dtnow = datetime.datetime.fromtimestamp(epoch_now)

    if os.path.exists(CACHE_HTML_FILE) and (epoch_now - os.stat(CACHE_HTML_FILE).st_mtime) < float(CACHE_LIFE):
        with open(CACHE_HTML_FILE) as cache_html_file:
            ret = cache_html_file.read()

    else:
        with open(FEEDS_FILE) as feeds_file:
            feedlines = feeds_file.readlines()

        # Results are placed by index so page order matches file order
        # regardless of fetch completion order.
        docstruct = [None]*len(feedlines)
        num_input = 0
        for (i, l) in enumerate(feedlines):
            # FIX: strip before testing, so comments with leading
            # whitespace are recognized and blank lines are no longer
            # enqueued as empty-string URLs.
            l = l.strip()
            if not l or l.startswith('#'):
                continue
            input_queue.put((i, l))
            num_input += 1

        for _ in range(num_input):
            (idx, docfeed) = output_queue.get()
            docstruct[idx] = docfeed

        ret = _to_html(dtnow, docstruct)

        with open(CACHE_HTML_FILE, 'w') as cache_html_file:
            cache_html_file.write(ret)

    return ret


class MyRssApp:
    """WSGI application object: one shared worker pool per process."""

    def __init__(self):
        # Bounded queues apply back-pressure if feeds outnumber workers.
        self._iq = Queue.Queue(MAX_THREADS)
        self._oq = Queue.Queue(MAX_THREADS)

        for _ in range(MAX_THREADS):
            WorkerThread(input_queue=self._iq, output_queue=self._oq).start()

    def __call__(self, environ, start_response):
        """Standard WSGI entry point."""
        response_body = main(self._iq, self._oq)
        response_headers = [
            ("Content-Type", "text/html"),
            ("Content-Length", str(len(response_body))),
        ]
        start_response("200 OK", response_headers)

        return [response_body]