changeset 43:df6a1a347584

rename myrss_parser.py to myrss_app.py
author paulo
date Tue, 05 Feb 2013 00:01:49 -0800
parents a1456ecd25b9
children c673e9e9c4ca
files myrss/myrss_app.py myrss/myrss_parser.py myrss/myrss_test_server.py
diffstat 3 files changed, 215 insertions(+), 215 deletions(-) [+]
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/myrss/myrss_app.py	Tue Feb 05 00:01:49 2013 -0800
     1.3 @@ -0,0 +1,213 @@
     1.4 +import os
     1.5 +import sys
     1.6 +import re
     1.7 +import urllib2
     1.8 +import threading
     1.9 +import Queue
    1.10 +import datetime
    1.11 +import time
    1.12 +import logging
    1.13 +logging.basicConfig(level=logging.INFO)
    1.14 +
    1.15 +import html
    1.16 +import xml.etree.ElementTree 
    1.17 +
    1.18 +
    1.19 +FEEDS_FILE = "FEEDS"
    1.20 +CACHE_HTML_FILE = "__cache__.html"
    1.21 +
    1.22 +#CACHE_LIFE = 1200 # [seconds]
    1.23 +CACHE_LIFE = 30 # [seconds]
    1.24 +MAX_ITEMS = 30
    1.25 +MAX_LINK_Z = 4
    1.26 +MAX_THREADS = 20
    1.27 +
    1.28 +
    1.29 +_PARSE_ROOT_TAG_RE = re.compile(r"(\{(.+)\})?(.+)")
    1.30 +
    1.31 +def _parse_root_tag(root_tag):
    1.32 +	re_match = _PARSE_ROOT_TAG_RE.match(root_tag)
    1.33 +
    1.34 +	if re_match is None:
    1.35 +		return (None, None)
    1.36 +	else:
    1.37 +		return re_match.group(2, 3)
    1.38 +	
    1.39 +
    1.40 +def _go_rss(elementTree):
    1.41 +	title = elementTree.find("channel/title").text.strip()
    1.42 +	link = elementTree.find("channel/link").text
    1.43 +
    1.44 +	items = []
    1.45 +
    1.46 +	for i in elementTree.findall("channel/item")[:MAX_ITEMS]:
    1.47 +		it_title = i.find("title").text.strip()
    1.48 +		it_link = i.find("link").text
    1.49 +
    1.50 +		items.append((it_title, it_link))
    1.51 +
    1.52 +	return (title, link, items)
    1.53 +
    1.54 +
    1.55 +def _go_atom(elementTree):
    1.56 +	ns = "http://www.w3.org/2005/Atom"
    1.57 +
    1.58 +	title = elementTree.find("{%s}title" % ns).text.strip()
    1.59 +	link = ''
    1.60 +
    1.61 +	for i in elementTree.findall("{%s}link" % ns):
    1.62 +		if i.get("type") == "text/html" and i.get("rel") == "alternate":
    1.63 +			link = i.get("href")
    1.64 +			break
    1.65 +
    1.66 +	items = []
    1.67 +
    1.68 +	for i in elementTree.findall("{%s}entry" % ns)[:MAX_ITEMS]:
    1.69 +		it_title = i.find("{%s}title" % ns).text.strip()
    1.70 +		it_link = ''
    1.71 +		
    1.72 +		for j in i.findall("{%s}link" % ns):
    1.73 +			if j.get("type") == "text/html" and j.get("rel") == "alternate":
    1.74 +				it_link = j.get("href")
    1.75 +				break
    1.76 +
    1.77 +		items.append((it_title, it_link))
    1.78 +
    1.79 +	return (title, link, items)
    1.80 +
    1.81 +
    1.82 +def _to_html(dtnow, docstruct):
    1.83 +	datetime_str = dtnow.strftime("%Y-%m-%d %H:%M %Z")
    1.84 +	page_title = "myrss -- %s" % datetime_str
    1.85 +
    1.86 +	root = html.HTML("html")
    1.87 +
    1.88 +	header = root.header
    1.89 +	header.title(page_title)
    1.90 +	header.link(rel="stylesheet", type="text/css", href="index.css")
    1.91 +
    1.92 +	body = root.body
    1.93 +	body.h1(page_title)
    1.94 +
    1.95 +	link_z = 0
    1.96 +
    1.97 +	for feed in docstruct:
    1.98 +		if feed is None:
    1.99 +			continue
   1.100 +
   1.101 +		(title, link, items) = feed
   1.102 +
   1.103 +		body.h2.a(title, href=link, klass="z%d" % (link_z % MAX_LINK_Z))
   1.104 +		link_z += 1
   1.105 +		p = body.p
   1.106 +
   1.107 +		for (i, (it_title, it_link)) in enumerate(items):
   1.108 +			if i > 0:
   1.109 +				p += " - "
   1.110 +
   1.111 +			p.a(it_title, href=it_link, klass="z%d" % (link_z % MAX_LINK_Z))
   1.112 +			link_z += 1
   1.113 +
   1.114 +	return unicode(root).encode("utf-8")
   1.115 +
   1.116 +
   1.117 +def _process_url(url):
   1.118 +	ret = None
   1.119 +
   1.120 +	try:
   1.121 +		logging.info("processing %s" % url)
   1.122 +		feed = urllib2.urlopen(urllib2.Request(url, headers={"User-Agent": ''}))
   1.123 +	except urllib2.HTTPError as e:
   1.124 +		logging.info("(%s) %s" % (url, e))
   1.125 +		return ret
   1.126 +
   1.127 +	elementTree = xml.etree.ElementTree.parse(feed)
   1.128 +	root = elementTree.getroot()
   1.129 +
   1.130 +	parsed_root_tag = _parse_root_tag(root.tag) 
   1.131 +
   1.132 +	if parsed_root_tag == (None, "rss"):
   1.133 +		version = float(root.get("version", 0.0))
   1.134 +		if version >= 2.0:
   1.135 +			ret = _go_rss(elementTree)
   1.136 +		else:
   1.137 +			raise NotImplementedError("Unsupported rss version")
   1.138 +	elif parsed_root_tag == ("http://www.w3.org/2005/Atom", "feed"):
   1.139 +		ret = _go_atom(elementTree)
   1.140 +	else:
   1.141 +		raise NotImplementedError("Unknown root tag")
   1.142 +
   1.143 +	return ret
   1.144 +
   1.145 +
   1.146 +class WorkerThread(threading.Thread):
   1.147 +	def __init__(self, *args, **kwargs):
   1.148 +		self._input_queue = kwargs.pop("input_queue")
   1.149 +		self._output_queue = kwargs.pop("output_queue")
   1.150 +		threading.Thread.__init__(self, *args, **kwargs)
   1.151 +		self.daemon = True
   1.152 +
   1.153 +	def run(self):
   1.154 +		while True:
   1.155 +			(idx, url) = self._input_queue.get()
   1.156 +			docfeed = None
   1.157 +			try:
   1.158 +				docfeed = _process_url(url)
   1.159 +			except Exception as e:
   1.160 +				logging.info("(%s) exception: %s" % (url, e))
   1.161 +			self._output_queue.put((idx, docfeed))
   1.162 +			
   1.163 +
   1.164 +def main(input_queue, output_queue):
   1.165 +	ret = ''
   1.166 +
   1.167 +	epoch_now = time.time()
   1.168 +	dtnow = datetime.datetime.fromtimestamp(epoch_now)
   1.169 +
   1.170 +	if os.path.exists(CACHE_HTML_FILE) and (epoch_now - os.stat(CACHE_HTML_FILE).st_mtime) < float(CACHE_LIFE):
   1.171 +		with open(CACHE_HTML_FILE) as cache_html_file:
   1.172 +			ret = cache_html_file.read()
   1.173 +
   1.174 +	else:
   1.175 +		with open(FEEDS_FILE) as feeds_file:
   1.176 +			feedlines = feeds_file.readlines()
   1.177 +
   1.178 +		docstruct = [None]*len(feedlines)
   1.179 +		num_input = 0
   1.180 +		for (i, l) in enumerate(feedlines):
   1.181 +			if l[0] != '#':
   1.182 +				l = l.strip()
   1.183 +				input_queue.put((i, l))
   1.184 +				num_input += 1
   1.185 +
   1.186 +		for _ in range(num_input):
   1.187 +			(idx, docfeed) = output_queue.get()
   1.188 +			docstruct[idx] = docfeed
   1.189 +
   1.190 +		ret = _to_html(dtnow, docstruct)
   1.191 +
   1.192 +		with open(CACHE_HTML_FILE, 'w') as cache_html_file:
   1.193 +			cache_html_file.write(ret)
   1.194 +
   1.195 +	return ret
   1.196 +
   1.197 +
   1.198 +class MyRssApp:
   1.199 +	def __init__(self):
   1.200 +		self._iq = Queue.Queue(MAX_THREADS)
   1.201 +		self._oq = Queue.Queue(MAX_THREADS)
   1.202 +
   1.203 +		for _ in range(MAX_THREADS):
   1.204 +			WorkerThread(input_queue=self._iq, output_queue=self._oq).start()
   1.205 +
   1.206 +	def __call__(self, environ, start_response):
   1.207 +		response_body = main(self._iq, self._oq)
   1.208 +		response_headers = [
   1.209 +			("Content-Type", "text/html"),
   1.210 +			("Content-Length", str(len(response_body))),
   1.211 +		]
   1.212 +		start_response("200 OK", response_headers)
   1.213 +
   1.214 +		return [response_body]
   1.215 +
   1.216 +
     2.1 --- a/myrss/myrss_parser.py	Mon Feb 04 23:58:02 2013 -0800
     2.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.3 @@ -1,213 +0,0 @@
     2.4 -import os
     2.5 -import sys
     2.6 -import re
     2.7 -import urllib2
     2.8 -import threading
     2.9 -import Queue
    2.10 -import datetime
    2.11 -import time
    2.12 -import logging
    2.13 -logging.basicConfig(level=logging.INFO)
    2.14 -
    2.15 -import html
    2.16 -import xml.etree.ElementTree 
    2.17 -
    2.18 -
    2.19 -FEEDS_FILE = "FEEDS"
    2.20 -CACHE_HTML_FILE = "__cache__.html"
    2.21 -
    2.22 -#CACHE_LIFE = 1200 # [seconds]
    2.23 -CACHE_LIFE = 30 # [seconds]
    2.24 -MAX_ITEMS = 30
    2.25 -MAX_LINK_Z = 4
    2.26 -MAX_THREADS = 20
    2.27 -
    2.28 -
    2.29 -_PARSE_ROOT_TAG_RE = re.compile(r"(\{(.+)\})?(.+)")
    2.30 -
    2.31 -def _parse_root_tag(root_tag):
    2.32 -	re_match = _PARSE_ROOT_TAG_RE.match(root_tag)
    2.33 -
    2.34 -	if re_match is None:
    2.35 -		return (None, None)
    2.36 -	else:
    2.37 -		return re_match.group(2, 3)
    2.38 -	
    2.39 -
    2.40 -def _go_rss(elementTree):
    2.41 -	title = elementTree.find("channel/title").text.strip()
    2.42 -	link = elementTree.find("channel/link").text
    2.43 -
    2.44 -	items = []
    2.45 -
    2.46 -	for i in elementTree.findall("channel/item")[:MAX_ITEMS]:
    2.47 -		it_title = i.find("title").text.strip()
    2.48 -		it_link = i.find("link").text
    2.49 -
    2.50 -		items.append((it_title, it_link))
    2.51 -
    2.52 -	return (title, link, items)
    2.53 -
    2.54 -
    2.55 -def _go_atom(elementTree):
    2.56 -	ns = "http://www.w3.org/2005/Atom"
    2.57 -
    2.58 -	title = elementTree.find("{%s}title" % ns).text.strip()
    2.59 -	link = ''
    2.60 -
    2.61 -	for i in elementTree.findall("{%s}link" % ns):
    2.62 -		if i.get("type") == "text/html" and i.get("rel") == "alternate":
    2.63 -			link = i.get("href")
    2.64 -			break
    2.65 -
    2.66 -	items = []
    2.67 -
    2.68 -	for i in elementTree.findall("{%s}entry" % ns)[:MAX_ITEMS]:
    2.69 -		it_title = i.find("{%s}title" % ns).text.strip()
    2.70 -		it_link = ''
    2.71 -		
    2.72 -		for j in i.findall("{%s}link" % ns):
    2.73 -			if j.get("type") == "text/html" and j.get("rel") == "alternate":
    2.74 -				it_link = j.get("href")
    2.75 -				break
    2.76 -
    2.77 -		items.append((it_title, it_link))
    2.78 -
    2.79 -	return (title, link, items)
    2.80 -
    2.81 -
    2.82 -def _to_html(dtnow, docstruct):
    2.83 -	datetime_str = dtnow.strftime("%Y-%m-%d %H:%M %Z")
    2.84 -	page_title = "myrss -- %s" % datetime_str
    2.85 -
    2.86 -	root = html.HTML("html")
    2.87 -
    2.88 -	header = root.header
    2.89 -	header.title(page_title)
    2.90 -	header.link(rel="stylesheet", type="text/css", href="index.css")
    2.91 -
    2.92 -	body = root.body
    2.93 -	body.h1(page_title)
    2.94 -
    2.95 -	link_z = 0
    2.96 -
    2.97 -	for feed in docstruct:
    2.98 -		if feed is None:
    2.99 -			continue
   2.100 -
   2.101 -		(title, link, items) = feed
   2.102 -
   2.103 -		body.h2.a(title, href=link, klass="z%d" % (link_z % MAX_LINK_Z))
   2.104 -		link_z += 1
   2.105 -		p = body.p
   2.106 -
   2.107 -		for (i, (it_title, it_link)) in enumerate(items):
   2.108 -			if i > 0:
   2.109 -				p += " - "
   2.110 -
   2.111 -			p.a(it_title, href=it_link, klass="z%d" % (link_z % MAX_LINK_Z))
   2.112 -			link_z += 1
   2.113 -
   2.114 -	return unicode(root).encode("utf-8")
   2.115 -
   2.116 -
   2.117 -def _process_url(url):
   2.118 -	ret = None
   2.119 -
   2.120 -	try:
   2.121 -		logging.info("processing %s" % url)
   2.122 -		feed = urllib2.urlopen(urllib2.Request(url, headers={"User-Agent": ''}))
   2.123 -	except urllib2.HTTPError as e:
   2.124 -		logging.info("(%s) %s" % (url, e))
   2.125 -		return ret
   2.126 -
   2.127 -	elementTree = xml.etree.ElementTree.parse(feed)
   2.128 -	root = elementTree.getroot()
   2.129 -
   2.130 -	parsed_root_tag = _parse_root_tag(root.tag) 
   2.131 -
   2.132 -	if parsed_root_tag == (None, "rss"):
   2.133 -		version = float(root.get("version", 0.0))
   2.134 -		if version >= 2.0:
   2.135 -			ret = _go_rss(elementTree)
   2.136 -		else:
   2.137 -			raise NotImplementedError("Unsupported rss version")
   2.138 -	elif parsed_root_tag == ("http://www.w3.org/2005/Atom", "feed"):
   2.139 -		ret = _go_atom(elementTree)
   2.140 -	else:
   2.141 -		raise NotImplementedError("Unknown root tag")
   2.142 -
   2.143 -	return ret
   2.144 -
   2.145 -
   2.146 -class WorkerThread(threading.Thread):
   2.147 -	def __init__(self, *args, **kwargs):
   2.148 -		self._input_queue = kwargs.pop("input_queue")
   2.149 -		self._output_queue = kwargs.pop("output_queue")
   2.150 -		threading.Thread.__init__(self, *args, **kwargs)
   2.151 -		self.daemon = True
   2.152 -
   2.153 -	def run(self):
   2.154 -		while True:
   2.155 -			(idx, url) = self._input_queue.get()
   2.156 -			docfeed = None
   2.157 -			try:
   2.158 -				docfeed = _process_url(url)
   2.159 -			except Exception as e:
   2.160 -				logging.info("(%s) exception: %s" % (url, e))
   2.161 -			self._output_queue.put((idx, docfeed))
   2.162 -			
   2.163 -
   2.164 -def main(input_queue, output_queue):
   2.165 -	ret = ''
   2.166 -
   2.167 -	epoch_now = time.time()
   2.168 -	dtnow = datetime.datetime.fromtimestamp(epoch_now)
   2.169 -
   2.170 -	if os.path.exists(CACHE_HTML_FILE) and (epoch_now - os.stat(CACHE_HTML_FILE).st_mtime) < float(CACHE_LIFE):
   2.171 -		with open(CACHE_HTML_FILE) as cache_html_file:
   2.172 -			ret = cache_html_file.read()
   2.173 -
   2.174 -	else:
   2.175 -		with open(FEEDS_FILE) as feeds_file:
   2.176 -			feedlines = feeds_file.readlines()
   2.177 -
   2.178 -		docstruct = [None]*len(feedlines)
   2.179 -		num_input = 0
   2.180 -		for (i, l) in enumerate(feedlines):
   2.181 -			if l[0] != '#':
   2.182 -				l = l.strip()
   2.183 -				input_queue.put((i, l))
   2.184 -				num_input += 1
   2.185 -
   2.186 -		for _ in range(num_input):
   2.187 -			(idx, docfeed) = output_queue.get()
   2.188 -			docstruct[idx] = docfeed
   2.189 -
   2.190 -		ret = _to_html(dtnow, docstruct)
   2.191 -
   2.192 -		with open(CACHE_HTML_FILE, 'w') as cache_html_file:
   2.193 -			cache_html_file.write(ret)
   2.194 -
   2.195 -	return ret
   2.196 -
   2.197 -
   2.198 -class MyRssApp:
   2.199 -	def __init__(self):
   2.200 -		self._iq = Queue.Queue(MAX_THREADS)
   2.201 -		self._oq = Queue.Queue(MAX_THREADS)
   2.202 -
   2.203 -		for _ in range(MAX_THREADS):
   2.204 -			WorkerThread(input_queue=self._iq, output_queue=self._oq).start()
   2.205 -
   2.206 -	def __call__(self, environ, start_response):
   2.207 -		response_body = main(self._iq, self._oq)
   2.208 -		response_headers = [
   2.209 -			("Content-Type", "text/html"),
   2.210 -			("Content-Length", str(len(response_body))),
   2.211 -		]
   2.212 -		start_response("200 OK", response_headers)
   2.213 -
   2.214 -		return [response_body]
   2.215 -
   2.216 -
     3.1 --- a/myrss/myrss_test_server.py	Mon Feb 04 23:58:02 2013 -0800
     3.2 +++ b/myrss/myrss_test_server.py	Tue Feb 05 00:01:49 2013 -0800
     3.3 @@ -1,7 +1,7 @@
     3.4  import wsgiref.simple_server 
     3.5  import SocketServer
     3.6  
     3.7 -import myrss_parser
     3.8 +import myrss_app
     3.9  
    3.10  
    3.11  class ThreadingWSGIServer(SocketServer.ThreadingMixIn, wsgiref.simple_server.WSGIServer):
    3.12 @@ -10,5 +10,5 @@
    3.13  
    3.14  if __name__ == "__main__":
    3.15  	httpd = ThreadingWSGIServer(('', 8000), wsgiref.simple_server.WSGIRequestHandler)
    3.16 -	httpd.set_app(myrss_parser.MyRssApp())
    3.17 +	httpd.set_app(myrss_app.MyRssApp())
    3.18  	httpd.serve_forever()