# HG changeset patch
# User paulo
# Date 1359710767 28800
# Node ID 5f9bc02e9cafdda4b0a31b33693d5af5f5111a31
# Parent  62464a0034d148907fc74fe57c4ecad864aa881d
add datetimestamp and caching

diff -r 62464a0034d1 -r 5f9bc02e9caf myrss/myrss_parser.py
--- a/myrss/myrss_parser.py	Thu Jan 31 02:19:39 2013 -0800
+++ b/myrss/myrss_parser.py	Fri Feb 01 01:26:07 2013 -0800
@@ -4,11 +4,18 @@
 import urllib2
 import threading
 import Queue
+import datetime
+import time
 
 import html
 import xml.etree.ElementTree
 
 
+FEEDS_FILE = "FEEDS"
+CACHE_HTML_FILE = "__cache__.html"
+
+#CACHE_LIFE = 1200 # [seconds]
+CACHE_LIFE = 30 # [seconds]
 MAX_ITEMS = 30
 MAX_LINK_Z = 4
 MAX_THREADS = 20
@@ -67,13 +74,19 @@
     return (title, link, items)
 
 
-def _to_html(docstruct):
+def _to_html(dtnow, docstruct):
+    datetime_str = dtnow.strftime("%Y-%m-%d %H:%M %Z")
+    page_title = "myrss -- %s" % datetime_str
+
     root = html.HTML()
 
     header = root.header
-    header.title("myrss")
+    header.title(page_title)
     header.link(rel="stylesheet", type="text/css", href="index.css")
 
+    body = root.body
+    body.h1(page_title)
+
     link_z = 0
 
     for feed in docstruct:
@@ -82,9 +95,9 @@
 
         (title, link, items) = feed
 
-        root.h1.a(title, href=link, klass="z%d" % (link_z % MAX_LINK_Z))
+        body.h2.a(title, href=link, klass="z%d" % (link_z % MAX_LINK_Z))
         link_z += 1
-        p = root.p
+        p = body.p
 
         for (i, (it_title, it_link)) in enumerate(items):
             if i > 0:
@@ -101,7 +114,7 @@
 
     try:
         print >> sys.stderr, "--> processing %s" % url
-        feed = urllib2.urlopen(url)
+        feed = urllib2.urlopen(urllib2.Request(url, headers={"User-Agent": ''}))
     except urllib2.HTTPError as e:
         print >> sys.stderr, "--> (%s) %s" % (url, e)
         return ret
@@ -144,29 +157,49 @@
             self._input_queue.task_done()
 
 
+def main():
+    ret = ''
+
+    epoch_now = time.time()
+    dtnow = datetime.datetime.fromtimestamp(epoch_now)
+
+    if os.path.exists(CACHE_HTML_FILE) and (epoch_now - os.stat(CACHE_HTML_FILE).st_mtime) < float(CACHE_LIFE):
+        with open(CACHE_HTML_FILE) as cache_html_file:
+            ret = cache_html_file.read()
+
+    else:
+        with open(FEEDS_FILE) as feeds_file:
+            feedlines = feeds_file.readlines()
+
+        docstruct = [None]*len(feedlines)
+        iq = Queue.Queue(feedlines)
+        oq = Queue.Queue(feedlines)
+
+        for _ in range(MAX_THREADS):
+            WorkerThread(input_queue=iq, output_queue=oq).start()
+
+        for (i, l) in enumerate(feedlines):
+            if l[0] != '#':
+                l = l.strip()
+                iq.put((i, l))
+
+        iq.join()
+
+        while True:
+            try:
+                (idx, docfeed) = oq.get_nowait()
+                docstruct[idx] = docfeed
+            except Queue.Empty:
+                break
+
+        ret = _to_html(dtnow, docstruct)
+
+        with open(CACHE_HTML_FILE, 'w') as cache_html_file:
+            cache_html_file.write(ret)
+
+    return ret
+
+
 if __name__ == "__main__":
-    with open("FEEDS") as feeds_file:
-        feedlines = feeds_file.readlines()
+    print main()
 
-    docstruct = [None]*len(feedlines)
-    iq = Queue.Queue(feedlines)
-    oq = Queue.Queue(feedlines)
-
-    for _ in range(MAX_THREADS):
-        WorkerThread(input_queue=iq, output_queue=oq).start()
-
-    for (i, l) in enumerate(feedlines):
-        if l[0] != '#':
-            l = l.strip()
-            iq.put((i, l))
-
-    iq.join()
-
-    while True:
-        try:
-            (idx, docfeed) = oq.get_nowait()
-            docstruct[idx] = docfeed
-        except Queue.Empty:
-            break
-
-    print _to_html(docstruct)
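
# For reference, a minimal standalone sketch (Python 2) of the mtime-based
# cache check this changeset adds to main(): serve the cached HTML while it is
# younger than CACHE_LIFE, otherwise rebuild it and rewrite the cache file.
# The cached_or_rebuild() helper and the lambda below are illustrative only,
# not part of the patch; the constants mirror the ones introduced above.
import os
import time

CACHE_HTML_FILE = "__cache__.html"
CACHE_LIFE = 30  # [seconds]


def cached_or_rebuild(rebuild):
    now = time.time()

    # cache hit: file exists and is fresh enough
    if os.path.exists(CACHE_HTML_FILE) and (now - os.stat(CACHE_HTML_FILE).st_mtime) < float(CACHE_LIFE):
        with open(CACHE_HTML_FILE) as f:
            return f.read()

    # cache miss: regenerate the page and refresh the cache file
    page = rebuild()
    with open(CACHE_HTML_FILE, 'w') as f:
        f.write(page)
    return page


if __name__ == "__main__":
    print cached_or_rebuild(lambda: "<html>generated at %s</html>" % time.ctime())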