Mercurial > hg > index.fcgi > www > www-1
diff myrss/myrss_parser.py @ 41:5f9bc02e9caf
add datetimestamp and caching
author | paulo |
---|---|
date | Fri, 01 Feb 2013 01:26:07 -0800 |
parents | 62464a0034d1 |
children | a1456ecd25b9 |
line diff
1.1 --- a/myrss/myrss_parser.py Thu Jan 31 02:19:39 2013 -0800
1.2 +++ b/myrss/myrss_parser.py Fri Feb 01 01:26:07 2013 -0800
1.3 @@ -4,11 +4,18 @@
1.4  import urllib2
1.5  import threading
1.6  import Queue
1.7 +import datetime
1.8 +import time
1.9
1.10  import html
1.11  import xml.etree.ElementTree
1.12
1.13
1.14 +FEEDS_FILE = "FEEDS"
1.15 +CACHE_HTML_FILE = "__cache__.html"
1.16 +
1.17 +#CACHE_LIFE = 1200 # [seconds]
1.18 +CACHE_LIFE = 30 # [seconds]
1.19  MAX_ITEMS = 30
1.20  MAX_LINK_Z = 4
1.21  MAX_THREADS = 20
1.22 @@ -67,13 +74,19 @@
1.23      return (title, link, items)
1.24
1.25
1.26 -def _to_html(docstruct):
1.27 +def _to_html(dtnow, docstruct):
1.28 +    datetime_str = dtnow.strftime("%Y-%m-%d %H:%M %Z")
1.29 +    page_title = "myrss -- %s" % datetime_str
1.30 +
1.31      root = html.HTML()
1.32
1.33      header = root.header
1.34 -    header.title("myrss")
1.35 +    header.title(page_title)
1.36      header.link(rel="stylesheet", type="text/css", href="index.css")
1.37
1.38 +    body = root.body
1.39 +    body.h1(page_title)
1.40 +
1.41      link_z = 0
1.42
1.43      for feed in docstruct:
1.44 @@ -82,9 +95,9 @@
1.45
1.46          (title, link, items) = feed
1.47
1.48 -        root.h1.a(title, href=link, klass="z%d" % (link_z % MAX_LINK_Z))
1.49 +        body.h2.a(title, href=link, klass="z%d" % (link_z % MAX_LINK_Z))
1.50          link_z += 1
1.51 -        p = root.p
1.52 +        p = body.p
1.53
1.54          for (i, (it_title, it_link)) in enumerate(items):
1.55              if i > 0:
1.56 @@ -101,7 +114,7 @@
1.57
1.58      try:
1.59          print >> sys.stderr, "--> processing %s" % url
1.60 -        feed = urllib2.urlopen(url)
1.61 +        feed = urllib2.urlopen(urllib2.Request(url, headers={"User-Agent": ''}))
1.62      except urllib2.HTTPError as e:
1.63          print >> sys.stderr, "--> (%s) %s" % (url, e)
1.64          return ret
1.65 @@ -144,29 +157,49 @@
1.66              self._input_queue.task_done()
1.67
1.68
1.69 +def main():
1.70 +    ret = ''
1.71 +
1.72 +    epoch_now = time.time()
1.73 +    dtnow = datetime.datetime.fromtimestamp(epoch_now)
1.74 +
1.75 +    if os.path.exists(CACHE_HTML_FILE) and (epoch_now - os.stat(CACHE_HTML_FILE).st_mtime) < float(CACHE_LIFE):
1.76 +        with open(CACHE_HTML_FILE) as cache_html_file:
1.77 +            ret = cache_html_file.read()
1.78 +
1.79 +    else:
1.80 +        with open(FEEDS_FILE) as feeds_file:
1.81 +            feedlines = feeds_file.readlines()
1.82 +
1.83 +        docstruct = [None]*len(feedlines)
1.84 +        iq = Queue.Queue(feedlines)
1.85 +        oq = Queue.Queue(feedlines)
1.86 +
1.87 +        for _ in range(MAX_THREADS):
1.88 +            WorkerThread(input_queue=iq, output_queue=oq).start()
1.89 +
1.90 +        for (i, l) in enumerate(feedlines):
1.91 +            if l[0] != '#':
1.92 +                l = l.strip()
1.93 +                iq.put((i, l))
1.94 +
1.95 +        iq.join()
1.96 +
1.97 +        while True:
1.98 +            try:
1.99 +                (idx, docfeed) = oq.get_nowait()
1.100 +                docstruct[idx] = docfeed
1.101 +            except Queue.Empty:
1.102 +                break
1.103 +
1.104 +        ret = _to_html(dtnow, docstruct)
1.105 +
1.106 +        with open(CACHE_HTML_FILE, 'w') as cache_html_file:
1.107 +            cache_html_file.write(ret)
1.108 +
1.109 +    return ret
1.110 +
1.111 +
1.112  if __name__ == "__main__":
1.113 -    with open("FEEDS") as feeds_file:
1.114 -        feedlines = feeds_file.readlines()
1.115 +    print main()
1.116
1.117 -    docstruct = [None]*len(feedlines)
1.118 -    iq = Queue.Queue(feedlines)
1.119 -    oq = Queue.Queue(feedlines)
1.120 -
1.121 -    for _ in range(MAX_THREADS):
1.122 -        WorkerThread(input_queue=iq, output_queue=oq).start()
1.123 -
1.124 -    for (i, l) in enumerate(feedlines):
1.125 -        if l[0] != '#':
1.126 -            l = l.strip()
1.127 -            iq.put((i, l))
1.128 -
1.129 -    iq.join()
1.130 -
1.131 -    while True:
1.132 -        try:
1.133 -            (idx, docfeed) = oq.get_nowait()
1.134 -            docstruct[idx] = docfeed
1.135 -        except Queue.Empty:
1.136 -            break
1.137 -
1.138 -    print _to_html(docstruct)