comparison myrss/myrss_parser.py @ 41:5f9bc02e9caf
add datetimestamp and caching
| field | value |
|---|---|
| author | paulo |
| date | Fri, 01 Feb 2013 01:26:07 -0800 |
| parents | 62464a0034d1 |
| children | a1456ecd25b9 |
1:492d8d321f43 | 2:226e506e9c82 |
---|---|
2 import sys | 2 import sys |
3 import re | 3 import re |
4 import urllib2 | 4 import urllib2 |
5 import threading | 5 import threading |
6 import Queue | 6 import Queue |
| 7 import datetime |
| 8 import time |
7 | 9 |
8 import html | 10 import html |
9 import xml.etree.ElementTree | 11 import xml.etree.ElementTree |
10 | 12 |
11 | 13 |
| 14 FEEDS_FILE = "FEEDS" |
| 15 CACHE_HTML_FILE = "__cache__.html" |
| 16 |
| 17 #CACHE_LIFE = 1200 # [seconds] |
| 18 CACHE_LIFE = 30 # [seconds] |
12 MAX_ITEMS = 30 | 19 MAX_ITEMS = 30 |
13 MAX_LINK_Z = 4 | 20 MAX_LINK_Z = 4 |
14 MAX_THREADS = 20 | 21 MAX_THREADS = 20 |
15 | 22 |
16 | 23 |
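These new constants drive the caching added in `main()` further down: when `__cache__.html` is younger than `CACHE_LIFE` seconds, the cached page is served verbatim instead of re-fetching every feed. A minimal sketch of that freshness test (the `cache_is_fresh` helper is hypothetical); note that `main()` leans on the `os` module, which none of the hunks shown here import:

```python
import os
import time

CACHE_HTML_FILE = "__cache__.html"
CACHE_LIFE = 30  # [seconds]

def cache_is_fresh(epoch_now):
    # Serve the cached page only if it exists and was written
    # less than CACHE_LIFE seconds ago (judged by file mtime).
    return (os.path.exists(CACHE_HTML_FILE)
            and (epoch_now - os.stat(CACHE_HTML_FILE).st_mtime) < float(CACHE_LIFE))

print cache_is_fresh(time.time())
```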
65 items.append((it_title, it_link)) | 72 items.append((it_title, it_link)) |
66 | 73 |
67 return (title, link, items) | 74 return (title, link, items) |
68 | 75 |
69 | 76 |
70 def _to_html(docstruct): | 77 def _to_html(dtnow, docstruct): |
| 78 datetime_str = dtnow.strftime("%Y-%m-%d %H:%M %Z") |
| 79 page_title = "myrss -- %s" % datetime_str |
| 80 |
71 root = html.HTML() | 81 root = html.HTML() |
72 | 82 |
73 header = root.header | 83 header = root.header |
74 header.title("myrss") | 84 header.title(page_title) |
75 header.link(rel="stylesheet", type="text/css", href="index.css") | 85 header.link(rel="stylesheet", type="text/css", href="index.css") |
| 86 |
| 87 body = root.body |
| 88 body.h1(page_title) |
76 | 89 |
77 link_z = 0 | 90 link_z = 0 |
78 | 91 |
79 for feed in docstruct: | 92 for feed in docstruct: |
80 if feed is None: | 93 if feed is None: |
81 continue | 94 continue |
82 | 95 |
83 (title, link, items) = feed | 96 (title, link, items) = feed |
84 | 97 |
85 root.h1.a(title, href=link, klass="z%d" % (link_z % MAX_LINK_Z)) | 98 body.h2.a(title, href=link, klass="z%d" % (link_z % MAX_LINK_Z)) |
86 link_z += 1 | 99 link_z += 1 |
87 p = root.p | 100 p = body.p |
88 | 101 |
89 for (i, (it_title, it_link)) in enumerate(items): | 102 for (i, (it_title, it_link)) in enumerate(items): |
90 if i > 0: | 103 if i > 0: |
91 p += " - " | 104 p += " - " |
92 | 105 |
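`_to_html()` now receives the render time, stamps it into the page title, and demotes the per-feed headings from `h1` to `h2` beneath a single `body.h1`. A minimal sketch of the builder pattern as used here, assuming the pypi `html` package (attribute access creates child tags, `klass=` emits a `class` attribute); note that `%Z` formats as an empty string on a naive `datetime`:

```python
import datetime
import html

dtnow = datetime.datetime.now()
page_title = "myrss -- %s" % dtnow.strftime("%Y-%m-%d %H:%M %Z")

root = html.HTML()
header = root.header                # attribute access creates a child tag
header.title(page_title)
header.link(rel="stylesheet", type="text/css", href="index.css")
body = root.body
body.h1(page_title)                 # single page-level heading
body.h2.a("Example feed", href="http://feed.example/", klass="z0")  # hypothetical feed

print root                          # the builder serializes via str()
```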
99 def _process_url(url): | 112 def _process_url(url): |
100 ret = None | 113 ret = None |
101 | 114 |
102 try: | 115 try: |
103 print >> sys.stderr, "--> processing %s" % url | 116 print >> sys.stderr, "--> processing %s" % url |
104 feed = urllib2.urlopen(url) | 117 feed = urllib2.urlopen(urllib2.Request(url, headers={"User-Agent": ''})) |
105 except urllib2.HTTPError as e: | 118 except urllib2.HTTPError as e: |
106 print >> sys.stderr, "--> (%s) %s" % (url, e) | 119 print >> sys.stderr, "--> (%s) %s" % (url, e) |
107 return ret | 120 return ret |
108 | 121 |
109 elementTree = xml.etree.ElementTree.parse(feed) | 122 elementTree = xml.etree.ElementTree.parse(feed) |
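The fetch is now wrapped in a `urllib2.Request` so the `User-Agent` header can be overridden: some feed hosts reject Python's default `Python-urllib/2.x` agent, and the change sends an empty string instead. A standalone sketch of the same request against a hypothetical URL:

```python
import sys
import urllib2

url = "http://feed.example/rss.xml"  # hypothetical feed URL

# Empty User-Agent replaces the default "Python-urllib/2.x",
# which some servers refuse outright.
req = urllib2.Request(url, headers={"User-Agent": ''})
try:
    feed = urllib2.urlopen(req)
    print feed.read(200)
except urllib2.HTTPError as e:
    print >> sys.stderr, "(%s) %s" % (url, e)
```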
142 print >> sys.stderr, "--> (%s) exception: %s" % (url, e) | 155 print >> sys.stderr, "--> (%s) exception: %s" % (url, e) |
143 self._output_queue.put((idx, docfeed)) | 156 self._output_queue.put((idx, docfeed)) |
144 self._input_queue.task_done() | 157 self._input_queue.task_done() |
145 | 158 |
146 | 159 |
| 160 def main(): |
| 161 ret = '' |
| 162 |
| 163 epoch_now = time.time() |
| 164 dtnow = datetime.datetime.fromtimestamp(epoch_now) |
| 165 |
| 166 if os.path.exists(CACHE_HTML_FILE) and (epoch_now - os.stat(CACHE_HTML_FILE).st_mtime) < float(CACHE_LIFE): |
| 167 with open(CACHE_HTML_FILE) as cache_html_file: |
| 168 ret = cache_html_file.read() |
| 169 |
| 170 else: |
| 171 with open(FEEDS_FILE) as feeds_file: |
| 172 feedlines = feeds_file.readlines() |
| 173 |
| 174 docstruct = [None]*len(feedlines) |
| 175 iq = Queue.Queue(feedlines) |
| 176 oq = Queue.Queue(feedlines) |
| 177 |
| 178 for _ in range(MAX_THREADS): |
| 179 WorkerThread(input_queue=iq, output_queue=oq).start() |
| 180 |
| 181 for (i, l) in enumerate(feedlines): |
| 182 if l[0] != '#': |
| 183 l = l.strip() |
| 184 iq.put((i, l)) |
| 185 |
| 186 iq.join() |
| 187 |
| 188 while True: |
| 189 try: |
| 190 (idx, docfeed) = oq.get_nowait() |
| 191 docstruct[idx] = docfeed |
| 192 except Queue.Empty: |
| 193 break |
| 194 |
| 195 ret = _to_html(dtnow, docstruct) |
| 196 |
| 197 with open(CACHE_HTML_FILE, 'w') as cache_html_file: |
| 198 cache_html_file.write(ret) |
| 199 |
| 200 return ret |
| 201 |
| 202 |
147 if __name__ == "__main__": | 203 if __name__ == "__main__": |
148 with open("FEEDS") as feeds_file: | 204 print main() |
149 feedlines = feeds_file.readlines() | 205 |
150 | |
151 docstruct = [None]*len(feedlines) | |
152 iq = Queue.Queue(feedlines) | |
153 oq = Queue.Queue(feedlines) | |
154 | |
155 for _ in range(MAX_THREADS): | |
156 WorkerThread(input_queue=iq, output_queue=oq).start() | |
157 | |
158 for (i, l) in enumerate(feedlines): | |
159 if l[0] != '#': | |
160 l = l.strip() | |
161 iq.put((i, l)) | |
162 | |
163 iq.join() | |
164 | |
165 while True: | |
166 try: | |
167 (idx, docfeed) = oq.get_nowait() | |
168 docstruct[idx] = docfeed | |
169 except Queue.Empty: | |
170 break | |
171 | |
172 print _to_html(docstruct) |
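With `main()` factored out, the `__main__` block reduces to `print main()`, emitting the page (cached or freshly built) on stdout for the web server. One wrinkle carried over from the old block: `Queue.Queue()` takes an integer `maxsize`, not a sequence, so `Queue.Queue(feedlines)` only works because Python 2 tolerates comparing a list against an int, leaving the queue effectively unbounded. A sketch of the same fan-out/join pattern in its conventional form, with a stand-in worker:

```python
import Queue
import threading

def worker(iq, oq):
    # Stand-in for WorkerThread: consume (index, url) pairs forever.
    while True:
        (idx, url) = iq.get()
        oq.put((idx, "processed " + url))  # stand-in for _process_url(url)
        iq.task_done()

iq = Queue.Queue()  # maxsize defaults to 0, i.e. unbounded
oq = Queue.Queue()

for _ in range(4):
    t = threading.Thread(target=worker, args=(iq, oq))
    t.daemon = True  # don't keep the process alive for idle workers
    t.start()

urls = ["http://a.example/rss", "http://b.example/rss"]  # hypothetical
for (i, u) in enumerate(urls):
    iq.put((i, u))

iq.join()  # blocks until task_done() was called once per put()

results = [None] * len(urls)
while True:
    try:
        (idx, val) = oq.get_nowait()
        results[idx] = val
    except Queue.Empty:
        break

print results
```

Marking the workers as daemons lets the process exit after `iq.join()` even though they remain blocked forever on `iq.get()`.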