Mercurial > hg > index.fcgi > www > www-1
changeset 43:df6a1a347584
rename myrss_parser.py to myrss_app.py
| author | paulo |
|---|---|
| date | Tue, 05 Feb 2013 00:01:49 -0800 |
| parents | a1456ecd25b9 |
| children | c673e9e9c4ca |
| files | myrss/myrss_app.py myrss/myrss_parser.py myrss/myrss_test_server.py |
| diffstat | 3 files changed, 215 insertions(+), 215 deletions(-) [+] |
line diff
# myrss_app.py -- WSGI app that renders a set of RSS/Atom feeds as one HTML page.
#
# Feed URLs are read from FEEDS_FILE (one per line, '#' comments), fetched in
# parallel by a fixed pool of daemon worker threads, and the rendered page is
# cached on disk for CACHE_LIFE seconds.
#
# NOTE(review): this is Python 2 code (urllib2, Queue, unicode) and uses the
# third-party "html" builder package, not the stdlib html module.

import os
import sys
import re
import urllib2
import threading
import Queue
import datetime
import time
import logging
logging.basicConfig(level=logging.INFO)

import html
import xml.etree.ElementTree


FEEDS_FILE = "FEEDS"
CACHE_HTML_FILE = "__cache__.html"

#CACHE_LIFE = 1200 # [seconds]
CACHE_LIFE = 30 # [seconds]
MAX_ITEMS = 30
MAX_LINK_Z = 4
MAX_THREADS = 20
FETCH_TIMEOUT = 60 # [seconds] per-URL fetch cap; see _process_url


_PARSE_ROOT_TAG_RE = re.compile(r"(\{(.+)\})?(.+)")

def _parse_root_tag(root_tag):
    """Split an ElementTree tag of the form "{ns}name" into (ns, name).

    Returns (None, name) for an un-namespaced tag, and (None, None) when
    the tag does not match at all.
    """
    re_match = _PARSE_ROOT_TAG_RE.match(root_tag)

    if re_match is None:
        return (None, None)
    else:
        return re_match.group(2, 3)


def _go_rss(elementTree):
    """Extract (title, link, items) from an RSS 2.0 document.

    items is a list of (item_title, item_link) tuples, capped at MAX_ITEMS.
    Malformed feeds (missing title/link elements) raise AttributeError;
    the worker thread treats any exception as "feed unavailable".
    """
    title = elementTree.find("channel/title").text.strip()
    link = elementTree.find("channel/link").text

    items = []

    for i in elementTree.findall("channel/item")[:MAX_ITEMS]:
        it_title = i.find("title").text.strip()
        it_link = i.find("link").text

        items.append((it_title, it_link))

    return (title, link, items)


def _go_atom(elementTree):
    """Extract (title, link, items) from an Atom feed (same shape as _go_rss).

    The feed/entry link is the first <link> with rel="alternate" and
    type="text/html"; when none is present the link is left as ''.
    """
    ns = "http://www.w3.org/2005/Atom"

    title = elementTree.find("{%s}title" % ns).text.strip()
    link = ''

    for i in elementTree.findall("{%s}link" % ns):
        if i.get("type") == "text/html" and i.get("rel") == "alternate":
            link = i.get("href")
            break

    items = []

    for i in elementTree.findall("{%s}entry" % ns)[:MAX_ITEMS]:
        it_title = i.find("{%s}title" % ns).text.strip()
        it_link = ''

        for j in i.findall("{%s}link" % ns):
            if j.get("type") == "text/html" and j.get("rel") == "alternate":
                it_link = j.get("href")
                break

        items.append((it_title, it_link))

    return (title, link, items)


def _to_html(dtnow, docstruct):
    """Render the parsed feeds as a complete HTML page (utf-8 encoded bytes).

    docstruct entries are (title, link, items) tuples, or None for feeds
    that failed to download or parse (None entries are skipped).
    """
    datetime_str = dtnow.strftime("%Y-%m-%d %H:%M %Z")
    page_title = "myrss -- %s" % datetime_str

    root = html.HTML("html")

    header = root.header
    header.title(page_title)
    header.link(rel="stylesheet", type="text/css", href="index.css")

    body = root.body
    body.h1(page_title)

    link_z = 0  # rotates CSS classes z0..z(MAX_LINK_Z-1) across all links

    for feed in docstruct:
        if feed is None:
            continue

        (title, link, items) = feed

        body.h2.a(title, href=link, klass="z%d" % (link_z % MAX_LINK_Z))
        link_z += 1
        p = body.p

        for (i, (it_title, it_link)) in enumerate(items):
            if i > 0:
                p += " - "

            p.a(it_title, href=it_link, klass="z%d" % (link_z % MAX_LINK_Z))
            link_z += 1

    return unicode(root).encode("utf-8")


def _process_url(url):
    """Download and parse one feed URL.

    Returns a (title, link, items) tuple, or None when the fetch fails.
    Raises NotImplementedError for unsupported feed formats; other parse
    errors propagate and are logged by the calling worker thread.
    """
    ret = None

    try:
        logging.info("processing %s" % url)
        # FIX: pass a timeout -- without one, a single hung server blocked a
        # worker thread (and therefore every page render) indefinitely
        feed = urllib2.urlopen(urllib2.Request(url, headers={"User-Agent": ''}), timeout=FETCH_TIMEOUT)
    except urllib2.URLError as e:
        # FIX: catch URLError, not only its subclass HTTPError, so DNS and
        # connection failures are logged here instead of escaping
        logging.info("(%s) %s" % (url, e))
        return ret

    elementTree = xml.etree.ElementTree.parse(feed)
    root = elementTree.getroot()

    parsed_root_tag = _parse_root_tag(root.tag)

    if parsed_root_tag == (None, "rss"):
        version = float(root.get("version", 0.0))
        if version >= 2.0:
            ret = _go_rss(elementTree)
        else:
            raise NotImplementedError("Unsupported rss version")
    elif parsed_root_tag == ("http://www.w3.org/2005/Atom", "feed"):
        ret = _go_atom(elementTree)
    else:
        raise NotImplementedError("Unknown root tag")

    return ret


class WorkerThread(threading.Thread):
    """Daemon worker: pulls (idx, url) from input_queue, pushes (idx, parsed)
    to output_queue; any failure is logged and reported as (idx, None)."""

    def __init__(self, *args, **kwargs):
        self._input_queue = kwargs.pop("input_queue")
        self._output_queue = kwargs.pop("output_queue")
        threading.Thread.__init__(self, *args, **kwargs)
        self.daemon = True  # do not keep the process alive at shutdown

    def run(self):
        while True:
            (idx, url) = self._input_queue.get()
            docfeed = None
            try:
                docfeed = _process_url(url)
            except Exception as e:
                logging.info("(%s) exception: %s" % (url, e))
            self._output_queue.put((idx, docfeed))


def main(input_queue, output_queue):
    """Return the rendered HTML page.

    Serves the on-disk cache when it is younger than CACHE_LIFE seconds;
    otherwise fans the feed URLs out to the worker pool, re-renders, and
    refreshes the cache.
    """
    ret = ''

    epoch_now = time.time()
    dtnow = datetime.datetime.fromtimestamp(epoch_now)

    if os.path.exists(CACHE_HTML_FILE) and (epoch_now - os.stat(CACHE_HTML_FILE).st_mtime) < float(CACHE_LIFE):
        with open(CACHE_HTML_FILE) as cache_html_file:
            ret = cache_html_file.read()

    else:
        with open(FEEDS_FILE) as feeds_file:
            feedlines = feeds_file.readlines()

        docstruct = [None]*len(feedlines)
        num_input = 0
        for (i, l) in enumerate(feedlines):
            l = l.strip()
            # FIX: strip before testing -- the old check (l[0] != '#') let
            # blank lines through and enqueued '' as a URL, and treated
            # indented comment lines as URLs
            if l and not l.startswith('#'):
                input_queue.put((i, l))
                num_input += 1

        for _ in range(num_input):
            (idx, docfeed) = output_queue.get()
            docstruct[idx] = docfeed

        ret = _to_html(dtnow, docstruct)

        # FIX: write the cache atomically (temp file + rename) so a
        # concurrent request can never read a half-written page
        tmp_file = CACHE_HTML_FILE + ".tmp"
        with open(tmp_file, 'w') as cache_html_file:
            cache_html_file.write(ret)
        os.rename(tmp_file, CACHE_HTML_FILE)

    return ret


class MyRssApp:
    """WSGI application object: one shared worker pool, one shared cache."""

    def __init__(self):
        self._iq = Queue.Queue(MAX_THREADS)
        # FIX: the output queue must be unbounded -- with both queues capped
        # at MAX_THREADS, more than ~3*MAX_THREADS feeds could deadlock
        # main() (blocked putting input) against the workers (blocked
        # putting output before main starts consuming)
        self._oq = Queue.Queue()

        for _ in range(MAX_THREADS):
            WorkerThread(input_queue=self._iq, output_queue=self._oq).start()

    def __call__(self, environ, start_response):
        response_body = main(self._iq, self._oq)
        response_headers = [
            ("Content-Type", "text/html"),
            ("Content-Length", str(len(response_body))),
        ]
        start_response("200 OK", response_headers)

        return [response_body]
2.1 --- a/myrss/myrss_parser.py Mon Feb 04 23:58:02 2013 -0800 2.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 2.3 @@ -1,213 +0,0 @@ 2.4 -import os 2.5 -import sys 2.6 -import re 2.7 -import urllib2 2.8 -import threading 2.9 -import Queue 2.10 -import datetime 2.11 -import time 2.12 -import logging 2.13 -logging.basicConfig(level=logging.INFO) 2.14 - 2.15 -import html 2.16 -import xml.etree.ElementTree 2.17 - 2.18 - 2.19 -FEEDS_FILE = "FEEDS" 2.20 -CACHE_HTML_FILE = "__cache__.html" 2.21 - 2.22 -#CACHE_LIFE = 1200 # [seconds] 2.23 -CACHE_LIFE = 30 # [seconds] 2.24 -MAX_ITEMS = 30 2.25 -MAX_LINK_Z = 4 2.26 -MAX_THREADS = 20 2.27 - 2.28 - 2.29 -_PARSE_ROOT_TAG_RE = re.compile(r"(\{(.+)\})?(.+)") 2.30 - 2.31 -def _parse_root_tag(root_tag): 2.32 - re_match = _PARSE_ROOT_TAG_RE.match(root_tag) 2.33 - 2.34 - if re_match is None: 2.35 - return (None, None) 2.36 - else: 2.37 - return re_match.group(2, 3) 2.38 - 2.39 - 2.40 -def _go_rss(elementTree): 2.41 - title = elementTree.find("channel/title").text.strip() 2.42 - link = elementTree.find("channel/link").text 2.43 - 2.44 - items = [] 2.45 - 2.46 - for i in elementTree.findall("channel/item")[:MAX_ITEMS]: 2.47 - it_title = i.find("title").text.strip() 2.48 - it_link = i.find("link").text 2.49 - 2.50 - items.append((it_title, it_link)) 2.51 - 2.52 - return (title, link, items) 2.53 - 2.54 - 2.55 -def _go_atom(elementTree): 2.56 - ns = "http://www.w3.org/2005/Atom" 2.57 - 2.58 - title = elementTree.find("{%s}title" % ns).text.strip() 2.59 - link = '' 2.60 - 2.61 - for i in elementTree.findall("{%s}link" % ns): 2.62 - if i.get("type") == "text/html" and i.get("rel") == "alternate": 2.63 - link = i.get("href") 2.64 - break 2.65 - 2.66 - items = [] 2.67 - 2.68 - for i in elementTree.findall("{%s}entry" % ns)[:MAX_ITEMS]: 2.69 - it_title = i.find("{%s}title" % ns).text.strip() 2.70 - it_link = '' 2.71 - 2.72 - for j in i.findall("{%s}link" % ns): 2.73 - if j.get("type") == "text/html" and j.get("rel") == "alternate": 2.74 - 
it_link = j.get("href") 2.75 - break 2.76 - 2.77 - items.append((it_title, it_link)) 2.78 - 2.79 - return (title, link, items) 2.80 - 2.81 - 2.82 -def _to_html(dtnow, docstruct): 2.83 - datetime_str = dtnow.strftime("%Y-%m-%d %H:%M %Z") 2.84 - page_title = "myrss -- %s" % datetime_str 2.85 - 2.86 - root = html.HTML("html") 2.87 - 2.88 - header = root.header 2.89 - header.title(page_title) 2.90 - header.link(rel="stylesheet", type="text/css", href="index.css") 2.91 - 2.92 - body = root.body 2.93 - body.h1(page_title) 2.94 - 2.95 - link_z = 0 2.96 - 2.97 - for feed in docstruct: 2.98 - if feed is None: 2.99 - continue 2.100 - 2.101 - (title, link, items) = feed 2.102 - 2.103 - body.h2.a(title, href=link, klass="z%d" % (link_z % MAX_LINK_Z)) 2.104 - link_z += 1 2.105 - p = body.p 2.106 - 2.107 - for (i, (it_title, it_link)) in enumerate(items): 2.108 - if i > 0: 2.109 - p += " - " 2.110 - 2.111 - p.a(it_title, href=it_link, klass="z%d" % (link_z % MAX_LINK_Z)) 2.112 - link_z += 1 2.113 - 2.114 - return unicode(root).encode("utf-8") 2.115 - 2.116 - 2.117 -def _process_url(url): 2.118 - ret = None 2.119 - 2.120 - try: 2.121 - logging.info("processing %s" % url) 2.122 - feed = urllib2.urlopen(urllib2.Request(url, headers={"User-Agent": ''})) 2.123 - except urllib2.HTTPError as e: 2.124 - logging.info("(%s) %s" % (url, e)) 2.125 - return ret 2.126 - 2.127 - elementTree = xml.etree.ElementTree.parse(feed) 2.128 - root = elementTree.getroot() 2.129 - 2.130 - parsed_root_tag = _parse_root_tag(root.tag) 2.131 - 2.132 - if parsed_root_tag == (None, "rss"): 2.133 - version = float(root.get("version", 0.0)) 2.134 - if version >= 2.0: 2.135 - ret = _go_rss(elementTree) 2.136 - else: 2.137 - raise NotImplementedError("Unsupported rss version") 2.138 - elif parsed_root_tag == ("http://www.w3.org/2005/Atom", "feed"): 2.139 - ret = _go_atom(elementTree) 2.140 - else: 2.141 - raise NotImplementedError("Unknown root tag") 2.142 - 2.143 - return ret 2.144 - 2.145 - 2.146 -class 
WorkerThread(threading.Thread): 2.147 - def __init__(self, *args, **kwargs): 2.148 - self._input_queue = kwargs.pop("input_queue") 2.149 - self._output_queue = kwargs.pop("output_queue") 2.150 - threading.Thread.__init__(self, *args, **kwargs) 2.151 - self.daemon = True 2.152 - 2.153 - def run(self): 2.154 - while True: 2.155 - (idx, url) = self._input_queue.get() 2.156 - docfeed = None 2.157 - try: 2.158 - docfeed = _process_url(url) 2.159 - except Exception as e: 2.160 - logging.info("(%s) exception: %s" % (url, e)) 2.161 - self._output_queue.put((idx, docfeed)) 2.162 - 2.163 - 2.164 -def main(input_queue, output_queue): 2.165 - ret = '' 2.166 - 2.167 - epoch_now = time.time() 2.168 - dtnow = datetime.datetime.fromtimestamp(epoch_now) 2.169 - 2.170 - if os.path.exists(CACHE_HTML_FILE) and (epoch_now - os.stat(CACHE_HTML_FILE).st_mtime) < float(CACHE_LIFE): 2.171 - with open(CACHE_HTML_FILE) as cache_html_file: 2.172 - ret = cache_html_file.read() 2.173 - 2.174 - else: 2.175 - with open(FEEDS_FILE) as feeds_file: 2.176 - feedlines = feeds_file.readlines() 2.177 - 2.178 - docstruct = [None]*len(feedlines) 2.179 - num_input = 0 2.180 - for (i, l) in enumerate(feedlines): 2.181 - if l[0] != '#': 2.182 - l = l.strip() 2.183 - input_queue.put((i, l)) 2.184 - num_input += 1 2.185 - 2.186 - for _ in range(num_input): 2.187 - (idx, docfeed) = output_queue.get() 2.188 - docstruct[idx] = docfeed 2.189 - 2.190 - ret = _to_html(dtnow, docstruct) 2.191 - 2.192 - with open(CACHE_HTML_FILE, 'w') as cache_html_file: 2.193 - cache_html_file.write(ret) 2.194 - 2.195 - return ret 2.196 - 2.197 - 2.198 -class MyRssApp: 2.199 - def __init__(self): 2.200 - self._iq = Queue.Queue(MAX_THREADS) 2.201 - self._oq = Queue.Queue(MAX_THREADS) 2.202 - 2.203 - for _ in range(MAX_THREADS): 2.204 - WorkerThread(input_queue=self._iq, output_queue=self._oq).start() 2.205 - 2.206 - def __call__(self, environ, start_response): 2.207 - response_body = main(self._iq, self._oq) 2.208 - response_headers = 
[ 2.209 - ("Content-Type", "text/html"), 2.210 - ("Content-Length", str(len(response_body))), 2.211 - ] 2.212 - start_response("200 OK", response_headers) 2.213 - 2.214 - return [response_body] 2.215 - 2.216 -
3.1 --- a/myrss/myrss_test_server.py Mon Feb 04 23:58:02 2013 -0800 3.2 +++ b/myrss/myrss_test_server.py Tue Feb 05 00:01:49 2013 -0800 3.3 @@ -1,7 +1,7 @@ 3.4 import wsgiref.simple_server 3.5 import SocketServer 3.6 3.7 -import myrss_parser 3.8 +import myrss_app 3.9 3.10 3.11 class ThreadingWSGIServer(SocketServer.ThreadingMixIn, wsgiref.simple_server.WSGIServer): 3.12 @@ -10,5 +10,5 @@ 3.13 3.14 if __name__ == "__main__": 3.15 httpd = ThreadingWSGIServer(('', 8000), wsgiref.simple_server.WSGIRequestHandler) 3.16 - httpd.set_app(myrss_parser.MyRssApp()) 3.17 + httpd.set_app(myrss_app.MyRssApp()) 3.18 httpd.serve_forever()