comparison myrss/myrss_app.py @ 98:e2817e789895

myrss: update FEEDS, and add _filter_feed() function to remove bad characters from the new feed
author paulo
date Wed, 01 Aug 2018 22:33:22 -0700
parents 9bd400576469
children 083e09bb913a
comparison
equal deleted inserted replaced
13:7cf39f3fd427 14:6a023f65511a
5 import threading 5 import threading
6 import Queue 6 import Queue
7 import datetime 7 import datetime
8 import time 8 import time
9 import traceback 9 import traceback
10 import StringIO
10 11
11 import logging 12 import logging
12 #logging.basicConfig( 13 #logging.basicConfig(
13 # level=logging.DEBUG, 14 # level=logging.DEBUG,
14 # filename="_LOG", 15 # filename="_LOG",
166 feed = urllib2.urlopen(urllib2.Request(url, headers={"User-Agent": "Mozilla/5.0 Browser"}), timeout=URLOPEN_TIMEOUT) 167 feed = urllib2.urlopen(urllib2.Request(url, headers={"User-Agent": "Mozilla/5.0 Browser"}), timeout=URLOPEN_TIMEOUT)
167 except urllib2.HTTPError as e: 168 except urllib2.HTTPError as e:
168 logging.info("(%s) %s" % (url, e)) 169 logging.info("(%s) %s" % (url, e))
169 return None 170 return None
170 171
171 return feed 172 return feed.read()
173
174
175 def _filter_feed(feed):
176 ret = feed
177
178 filter_out = ["\x16"]
179 for i in filter_out:
180 ret = ret.replace(i, "")
181
182 return ret
172 183
173 184
174 def _process_feed(feed): 185 def _process_feed(feed):
175 ret = None 186 ret = None
176 187
177 elementTree = xml.etree.ElementTree.parse(feed) 188 feed_sio = StringIO.StringIO(feed)
189 elementTree = xml.etree.ElementTree.parse(feed_sio)
178 root = elementTree.getroot() 190 root = elementTree.getroot()
179 191
180 parsed_root_tag = _parse_root_tag(root.tag) 192 parsed_root_tag = _parse_root_tag(root.tag)
181 193
182 if parsed_root_tag == (None, "rss"): 194 if parsed_root_tag == (None, "rss"):
207 (idx, url) = self._input_queue.get() 219 (idx, url) = self._input_queue.get()
208 docfeed = None 220 docfeed = None
209 try: 221 try:
210 feed = _fetch_url(url) 222 feed = _fetch_url(url)
211 if feed is not None: 223 if feed is not None:
212 docfeed = _process_feed(feed) 224 docfeed = _process_feed(_filter_feed(feed))
213 except Exception as e: 225 except Exception as e:
214 logging.info("(%s) exception: (%s) %s" % (url, type(e), e)) 226 logging.info("(%s) exception: (%s) %s" % (url, type(e), e))
215 self._output_queue.put((idx, docfeed)) 227 self._output_queue.put((idx, docfeed))
216 228
217 229