# Reconstructed from a newline-mangled Mercurial patch (node df6a1a347584,
# "rename myrss_parser.py to myrss_app.py").  That patch added this module
# (myrss/myrss_app.py), deleted a byte-identical myrss/myrss_parser.py, and
# updated myrss/myrss_test_server.py to "import myrss_app" and construct
# myrss_app.MyRssApp() instead of myrss_parser.MyRssApp().
#
# NOTE(review): Python 2 code (urllib2, Queue, unicode) -- left on Python 2
# to stay consistent with the rest of the project.
"""Small WSGI app that aggregates RSS/Atom feeds into one cached HTML page."""

import os
import sys
import re
import urllib2
import threading
import Queue
import datetime
import time
import logging
logging.basicConfig(level=logging.INFO)

import html  # presumably the PyPI 'html' builder package (html.HTML) -- not py3 stdlib html
import xml.etree.ElementTree


FEEDS_FILE = "FEEDS"                # one feed URL per line; '#' starts a comment line
CACHE_HTML_FILE = "__cache__.html"  # rendered page is cached here between requests

#CACHE_LIFE = 1200 # [seconds]
CACHE_LIFE = 30   # [seconds] how long the cached page stays fresh
MAX_ITEMS = 30    # max items rendered per feed
MAX_LINK_Z = 4    # number of CSS classes (z0..z3) cycled for link styling
MAX_THREADS = 20  # worker threads fetching feeds concurrently


# Matches an ElementTree tag of the form "{namespace}local" or plain "local".
_PARSE_ROOT_TAG_RE = re.compile(r"(\{(.+)\})?(.+)")

def _parse_root_tag(root_tag):
    """Split an ElementTree root tag into (namespace_or_None, local_name)."""
    re_match = _PARSE_ROOT_TAG_RE.match(root_tag)

    if re_match is None:
        return (None, None)
    else:
        return re_match.group(2, 3)


def _go_rss(elementTree):
    """Extract (title, link, [(item_title, item_link), ...]) from an RSS 2.0 tree."""
    title = elementTree.find("channel/title").text.strip()
    link = elementTree.find("channel/link").text

    items = []

    for i in elementTree.findall("channel/item")[:MAX_ITEMS]:
        it_title = i.find("title").text.strip()
        it_link = i.find("link").text

        items.append((it_title, it_link))

    return (title, link, items)


def _go_atom(elementTree):
    """Extract (title, link, [(item_title, item_link), ...]) from an Atom tree."""
    ns = "http://www.w3.org/2005/Atom"

    title = elementTree.find("{%s}title" % ns).text.strip()
    link = ''

    # An Atom feed may carry several <link> elements; pick the HTML alternate.
    for i in elementTree.findall("{%s}link" % ns):
        if i.get("type") == "text/html" and i.get("rel") == "alternate":
            link = i.get("href")
            break

    items = []

    for i in elementTree.findall("{%s}entry" % ns)[:MAX_ITEMS]:
        it_title = i.find("{%s}title" % ns).text.strip()
        it_link = ''

        for j in i.findall("{%s}link" % ns):
            if j.get("type") == "text/html" and j.get("rel") == "alternate":
                it_link = j.get("href")
                break

        items.append((it_title, it_link))

    return (title, link, items)


def _to_html(dtnow, docstruct):
    """Render the parsed feeds into a UTF-8 encoded HTML page.

    dtnow is the timestamp shown in the page title; docstruct is a list of
    (title, link, items) tuples where None entries (failed feeds) are skipped.
    """
    datetime_str = dtnow.strftime("%Y-%m-%d %H:%M %Z")
    page_title = "myrss -- %s" % datetime_str

    root = html.HTML("html")

    header = root.header
    header.title(page_title)
    header.link(rel="stylesheet", type="text/css", href="index.css")

    body = root.body
    body.h1(page_title)

    link_z = 0  # cycles the z0..z(MAX_LINK_Z-1) CSS classes across all links

    for feed in docstruct:
        if feed is None:
            continue  # feed failed to download or parse

        (title, link, items) = feed

        body.h2.a(title, href=link, klass="z%d" % (link_z % MAX_LINK_Z))
        link_z += 1
        p = body.p

        for (i, (it_title, it_link)) in enumerate(items):
            if i > 0:
                p += " - "

            p.a(it_title, href=it_link, klass="z%d" % (link_z % MAX_LINK_Z))
            link_z += 1

    return unicode(root).encode("utf-8")


def _process_url(url):
    """Download and parse one feed URL.

    Returns (title, link, items), or None when the fetch fails.
    Raises NotImplementedError for feed formats this app does not handle.
    """
    ret = None

    try:
        logging.info("processing %s" % url)
        feed = urllib2.urlopen(urllib2.Request(url, headers={"User-Agent": ''}))
    # FIX: catch URLError (the base class of HTTPError) so connection-level
    # failures (DNS, refused connection, timeout) are logged here like HTTP
    # errors instead of escaping as generic exceptions to the worker.
    except urllib2.URLError as e:
        logging.info("(%s) %s" % (url, e))
        return ret

    elementTree = xml.etree.ElementTree.parse(feed)
    root = elementTree.getroot()

    parsed_root_tag = _parse_root_tag(root.tag)

    if parsed_root_tag == (None, "rss"):
        version = float(root.get("version", 0.0))
        if version >= 2.0:
            ret = _go_rss(elementTree)
        else:
            raise NotImplementedError("Unsupported rss version")
    elif parsed_root_tag == ("http://www.w3.org/2005/Atom", "feed"):
        ret = _go_atom(elementTree)
    else:
        raise NotImplementedError("Unknown root tag")

    return ret


class WorkerThread(threading.Thread):
    """Daemon thread: takes (index, url) from input_queue and puts
    (index, parsed_feed_or_None) on output_queue, forever."""

    def __init__(self, *args, **kwargs):
        self._input_queue = kwargs.pop("input_queue")
        self._output_queue = kwargs.pop("output_queue")
        threading.Thread.__init__(self, *args, **kwargs)
        self.daemon = True  # do not block interpreter shutdown

    def run(self):
        while True:
            (idx, url) = self._input_queue.get()
            docfeed = None
            try:
                docfeed = _process_url(url)
            except Exception as e:
                # Keep the worker alive no matter what a single feed does;
                # the slot is reported as None and skipped by _to_html().
                logging.info("(%s) exception: %s" % (url, e))
            self._output_queue.put((idx, docfeed))


def main(input_queue, output_queue):
    """Return the HTML page, serving it from CACHE_HTML_FILE when fresh.

    When the cache is older than CACHE_LIFE seconds, re-reads FEEDS_FILE,
    fans the URLs out to the worker threads, renders the result and rewrites
    the cache.  NOTE(review): concurrent requests that both see a stale cache
    will each regenerate the page; harmless, but not deduplicated.
    """
    ret = ''

    epoch_now = time.time()
    dtnow = datetime.datetime.fromtimestamp(epoch_now)

    if os.path.exists(CACHE_HTML_FILE) and (epoch_now - os.stat(CACHE_HTML_FILE).st_mtime) < float(CACHE_LIFE):
        with open(CACHE_HTML_FILE) as cache_html_file:
            ret = cache_html_file.read()

    else:
        with open(FEEDS_FILE) as feeds_file:
            feedlines = feeds_file.readlines()

        docstruct = [None]*len(feedlines)
        num_input = 0
        for (i, l) in enumerate(feedlines):
            # FIX: strip first and skip blank lines as well as '#' comments;
            # the original tested l[0] before stripping, so whitespace-only
            # lines were enqueued as empty URLs (a wasted fetch + log noise).
            l = l.strip()
            if l and not l.startswith('#'):
                input_queue.put((i, l))
                num_input += 1

        for _ in range(num_input):
            (idx, docfeed) = output_queue.get()
            docstruct[idx] = docfeed

        ret = _to_html(dtnow, docstruct)

        with open(CACHE_HTML_FILE, 'w') as cache_html_file:
            cache_html_file.write(ret)

    return ret


class MyRssApp:
    """WSGI application object; owns the worker pool and its queues."""

    def __init__(self):
        self._iq = Queue.Queue(MAX_THREADS)
        # FIX: the output queue must be unbounded.  With both queues capped
        # at MAX_THREADS, a FEEDS file with more than ~3*MAX_THREADS entries
        # deadlocks: main() blocks on input_queue.put() before it ever calls
        # output_queue.get(), while every worker blocks on output_queue.put().
        self._oq = Queue.Queue()

        for _ in range(MAX_THREADS):
            WorkerThread(input_queue=self._iq, output_queue=self._oq).start()

    def __call__(self, environ, start_response):
        response_body = main(self._iq, self._oq)
        response_headers = [
            ("Content-Type", "text/html"),
            ("Content-Length", str(len(response_body))),
        ]
        start_response("200 OK", response_headers)

        return [response_body]