annotate myrss/myrss_parser.py @ 39:915032dd35f4

add myrss2
author paulo
date Wed, 30 Jan 2013 02:32:22 -0800
parents
children 62464a0034d1
rev   line source
paulo@39 1 import os
paulo@39 2 import re
paulo@39 3
paulo@39 4 import html
paulo@39 5 import xml.etree.ElementTree
paulo@39 6
paulo@39 7
paulo@39 8 MAX_ITEMS = 30  # Upper bound on the items/entries kept from each parsed feed.
paulo@39 9 MAX_LINK_Z = 4  # Modulus applied to the running link counter when building the "z%d" klass values.
paulo@39 10
paulo@39 11
paulo@39 12 _PARSE_ROOT_TAG_RE = re.compile(r"(\{(.+)\})?(.+)")
paulo@39 13
paulo@39 14 def _parse_root_tag(root_tag):
paulo@39 15 re_match = _PARSE_ROOT_TAG_RE.match(root_tag)
paulo@39 16
paulo@39 17 if re_match is None:
paulo@39 18 return (None, None)
paulo@39 19 else:
paulo@39 20 return re_match.group(2, 3)
paulo@39 21
paulo@39 22
paulo@39 23 def _go_rss(elementTree):
paulo@39 24 title = elementTree.find("channel/title").text.strip()
paulo@39 25 link = elementTree.find("channel/link").text
paulo@39 26
paulo@39 27 items = []
paulo@39 28
paulo@39 29 for i in elementTree.findall("channel/item")[:MAX_ITEMS]:
paulo@39 30 it_title = i.find("title").text.strip()
paulo@39 31 it_link = i.find("link").text
paulo@39 32
paulo@39 33 items.append((it_title, it_link))
paulo@39 34
paulo@39 35 return (title, link, items)
paulo@39 36
paulo@39 37
paulo@39 38 def _go_atom(elementTree):
paulo@39 39 ns = "http://www.w3.org/2005/Atom"
paulo@39 40
paulo@39 41 title = elementTree.find("{%s}title" % ns).text.strip()
paulo@39 42 link = ''
paulo@39 43
paulo@39 44 for i in elementTree.findall("{%s}link" % ns):
paulo@39 45 if i.get("type") == "text/html" and i.get("rel") == "alternate":
paulo@39 46 link = i.get("href")
paulo@39 47 break
paulo@39 48
paulo@39 49 items = []
paulo@39 50
paulo@39 51 for i in elementTree.findall("{%s}entry" % ns)[:MAX_ITEMS]:
paulo@39 52 it_title = i.find("{%s}title" % ns).text.strip()
paulo@39 53 it_link = ''
paulo@39 54
paulo@39 55 for j in i.findall("{%s}link" % ns):
paulo@39 56 if j.get("type") == "text/html" and j.get("rel") == "alternate":
paulo@39 57 it_link = j.get("href")
paulo@39 58 break
paulo@39 59
paulo@39 60 items.append((it_title, it_link))
paulo@39 61
paulo@39 62 return (title, link, items)
paulo@39 63
paulo@39 64
def _to_html(docstruct):
    """Render the parsed feeds as a single UTF-8 encoded HTML string.

    docstruct is an iterable of (title, link, items) tuples, items being
    a sequence of (item_title, item_link) pairs.  Each feed becomes an
    <h1> anchor followed by one paragraph holding its item anchors
    separated by " - ".

    Every anchor is given klass "z<k>", where k comes from one counter
    that runs across all feeds and items and wraps at MAX_LINK_Z.
    """
    page = html.HTML()

    hdr = page.header
    hdr.title("myrss")
    hdr.link(rel="stylesheet", type="text/css", href="index.css")

    anchor_count = 0

    for (feed_title, feed_link, feed_items) in docstruct:
        feed_klass = "z%d" % (anchor_count % MAX_LINK_Z)
        page.h1.a(feed_title, href=feed_link, klass=feed_klass)
        anchor_count += 1

        paragraph = page.p

        for (position, (item_title, item_link)) in enumerate(feed_items):
            # Separator goes between items, never before the first one.
            if position > 0:
                paragraph += " - "

            item_klass = "z%d" % (anchor_count % MAX_LINK_Z)
            paragraph.a(item_title, href=item_link, klass=item_klass)
            anchor_count += 1

    return unicode(page).encode("utf-8")
paulo@39 89
paulo@39 90
if __name__ == "__main__":
    # Parse every "<n>.feedtmp" file (n = 0..30) found in the current
    # directory, then print the combined HTML page to stdout.
    XMLFILE = "%d.feedtmp"
    docstruct = []

    for feed_index in range(31):
        feed_path = XMLFILE % feed_index

        # Feed files that do not exist are simply skipped.
        if not os.path.exists(feed_path):
            continue

        elementTree = xml.etree.ElementTree.parse(feed_path)
        root = elementTree.getroot()
        root_kind = _parse_root_tag(root.tag)

        if root_kind == (None, "rss"):
            version = float(root.get("version", 0.0))
            if not (version >= 2.0):
                raise NotImplementedError("Unsupported rss version")
            docstruct.append(_go_rss(elementTree))
        elif root_kind == ("http://www.w3.org/2005/Atom", "feed"):
            docstruct.append(_go_atom(elementTree))
        else:
            raise NotImplementedError("Unknown root tag")

    if not docstruct:
        raise RuntimeError("Could not produce docstruct")

    # Parenthesized single-argument form prints the same under Python 2.
    print(_to_html(docstruct))