Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Coercedates plugin with fixes to spider.py allowing filters to modify the updated date #16

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 117 additions & 0 deletions filters/coercedates.plugin
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
# If you don't want items to "move up" on your planet if the source feed
# updates them (and changes the update date to something newer than was
# originally used) you may be tempted to use the "ignore_in_feed: updated"
# option, but there are three important things to realize about doing this:
#
# * When you ignore the "updated" date, it will default to the
# "published" date -- but if there is no "published" date (very common
# in many RSS feeds) it will default to the current date+time.
#
# * If you purge the entire cache (perhaps because you added a filter)
# all of the "updated" dates for those items w/o a "published" date will
# be re-set to the current date+time
#
# * The "updated" date is what Venus uses to sort the list
#
# This may all seem obvious, but it can be highly annoying when you deal
# with some feeds that have no "published" date and have to occasionally
# purge your cache.
#
# One solution would be to only use "ignore_in_feed: updated" on the feeds
# where you know the feed contains a "published" date for each item, and
# don't use it for feeds that only contain an "updated" date for each item
# -- but that can be tedious.
#
# So use this plugin instead
#
# This plugin will replace the "updated" and "published" dates of every item
# with whichever of the two values is the lowest, unless the item is already
# in the cache, in which case it will use the "updated" date from the item in
# the cache -- making it a safe alternative to "ignore_in_feed: updated" for
# all feeds regardless of whether the items have a "published" date or not,
# and regardless of whether the ones that do have a "published" date try to
# modify it or not.
#
###########################################################################

import sys, time, os
from xml.dom import minidom
import planet
from planet import reconstitute
from planet import config
from planet.reconstitute import date
from planet.spider import filename

log = planet.logger

# finds the first descendent element that matches the specified
# namespace and tag name, parses it (in canonical date format),
# returns the parsed value, and removes (all of the) element(s)
def parseAndPurgeDateElement(element, ns, tagName):
    """Parse the first matching date element, then remove every match.

    Looks up all descendants of ``element`` named ``tagName`` in namespace
    ``ns``.  The text of the first one is parsed as a canonical
    ``YYYY-MM-DDTHH:MM:SSZ`` timestamp; afterwards *all* matching elements
    are detached from their parents.

    Returns the parsed ``time.struct_time``, or ``None`` when there is no
    matching element or the first match carries no text (e.g.
    ``<updated/>``).  Raises ``ValueError`` (from ``time.strptime``) when
    the text is present but not in the canonical format; in that case the
    elements are left in place.
    """
    result = None
    # see if we have any date(s?)
    kids = element.getElementsByTagNameNS(ns, tagName)
    if kids:
        # record the first one; an empty element has no child node at all,
        # and a non-text first child has a nodeValue of None
        first = kids[0].firstChild
        text = first.nodeValue if first is not None else None
        if text and text.strip():
            # tolerate whitespace/newlines around the timestamp
            result = time.strptime(text.strip(), '%Y-%m-%dT%H:%M:%SZ')
        # get rid of all of them
        for trash in kids:
            trash.parentNode.removeChild(trash)
    return result


# given an entry element, looks up the entry's id in the cache and
# fetches the mtime of the cached file as a UTC time tuple (which should
# match the updated date if Venus has done its job right)
#
# returns None if the entry is not in the cache
def getDateFromCache(entry):
    """Return the cached mtime of ``entry`` as a UTC time tuple, or None.

    ``entry`` is an Atom entry DOM element.  Its first ``atom:id`` text is
    passed, together with the configured cache directory, to
    ``planet.spider.filename`` to obtain the cache file path; that file's
    modification time is returned via ``time.gmtime``.

    Returns ``None`` (after logging an error) when ``entry`` is ``None`` or
    has no usable id, and ``None`` silently when no cache file can be
    stat'ed for the entry.
    """
    if entry is None:
        log.error("Attempted to lookup the date of 'None'")
        return None

    # Guard every step of the lookup: a missing or empty <id> would
    # otherwise raise IndexError before the error below could be logged.
    idNodes = entry.getElementsByTagNameNS(atomNS, 'id')
    entryId = None
    if idNodes and idNodes[0].firstChild is not None:
        entryId = idNodes[0].firstChild.nodeValue
    if not entryId:
        log.error("Unable to find id in entry")
        return None

    cacheFile = filename(config.cache_directory(), entryId)
    # stat directly instead of exists()+stat(): same result, but no window
    # in which the file can vanish between the two calls
    try:
        return time.gmtime(os.stat(cacheFile).st_mtime)
    except OSError:
        return None


# XML namespaces used by the Venus-normalised feed read from stdin
atomNS = 'http://www.w3.org/2005/Atom'
planetNS = 'http://planet.intertwingly.net/'

# The filter reads the whole document from stdin ...
dom = minidom.parse(sys.stdin)

for e in dom.getElementsByTagNameNS(atomNS, 'entry'):

    # Strip both date elements from the entry, remembering their values.
    updatedDate = parseAndPurgeDateElement(e, atomNS, 'updated')
    pubDate = parseAndPurgeDateElement(e, atomNS, 'published')

    # A date already recorded in the cache always wins, so entries that
    # have been seen before never move.
    mainDate = getDateFromCache(e)
    if mainDate is None:
        if not updatedDate:
            mainDate = pubDate
        elif not pubDate:
            mainDate = updatedDate
        else:
            # earlier of the two; on a tie the "updated" value is kept
            mainDate = min(updatedDate, pubDate)

    # Write the single chosen date back under both element names.
    # NOTE(review): mainDate is None when the entry had no dates and is
    # not cached -- presumably reconstitute.date tolerates that; confirm.
    reconstitute.date(e, 'published', mainDate)
    reconstitute.date(e, 'updated', mainDate)

# ... and writes the modified document to stdout.  A single parenthesised
# argument behaves the same as the bare Python 2 print statement.
print(dom.toxml('utf-8'))
9 changes: 9 additions & 0 deletions planet/spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,15 @@ def writeCache(feed_uri, feed_info, data):
if os.path.exists(cache_file): os.remove(cache_file)
continue

# re-set mtime in case filters have modified it
try:
edoc = feedparser.parse(output)
mtime = calendar.timegm(edoc.entries[0].updated_parsed)
except:
log.warning("Unable to re-set mtime on %s after running filters: ",
entry.id,
sys.exc_info()[0])

# write out and timestamp the results
write(output, cache_file, mtime)

Expand Down
5 changes: 3 additions & 2 deletions planet/vendor/feedparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -1982,6 +1982,7 @@ def getPropertyValue(self, elmRoot, sProperty, iPropertyType=4, bAllowMultiple=0
sValue = bNormalize and self.normalize(sValue) or sValue.strip()
if (not sValue) and (iPropertyType == self.URI):
if sNodeName == 'a': sValue = elmResult.get('href')
elif sNodeName == 'iframe': sValue = elmResult.get('src')
elif sNodeName == 'img': sValue = elmResult.get('src')
elif sNodeName == 'object': sValue = elmResult.get('data')
if sValue:
Expand Down Expand Up @@ -2339,7 +2340,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir',
'div', 'dl', 'dt', 'em', 'event-source', 'fieldset', 'figure', 'footer',
'font', 'form', 'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i',
'img', 'input', 'ins', 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map',
'iframe', 'img', 'input', 'ins', 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map',
'menu', 'meter', 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup',
'option', 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong', 'sub',
Expand All @@ -2355,7 +2356,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
'colspan', 'compact', 'contenteditable', 'controls', 'coords', 'data',
'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 'delay',
'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for',
'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus',
'form', 'frame', 'frameborder', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus',
'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode',
'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc',
'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max',
Expand Down
4 changes: 2 additions & 2 deletions planet/vendor/html5lib/sanitizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class HTMLSanitizerMixin(object):
'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
'figure', 'footer', 'font', 'form', 'header', 'h1', 'h2', 'h3', 'h4',
'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', 'keygen', 'kbd',
'h5', 'h6', 'hr', 'i', 'iframe', 'img', 'input', 'ins', 'keygen', 'kbd',
'label', 'legend', 'li', 'm', 'map', 'menu', 'meter', 'multicol',
'nav', 'nextid', 'ol', 'output', 'optgroup', 'option', 'p', 'pre',
'progress', 'q', 's', 'samp', 'section', 'select', 'small', 'sound',
Expand Down Expand Up @@ -43,7 +43,7 @@ class HTMLSanitizerMixin(object):
'cols', 'colspan', 'compact', 'contenteditable', 'controls', 'coords',
'data', 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default',
'delay', 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end',
'face', 'for', 'form', 'frame', 'galleryimg', 'gutter', 'headers',
'face', 'for', 'form', 'frame', 'frameborder', 'galleryimg', 'gutter', 'headers',
'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang', 'hspace',
'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label', 'leftspacing',
'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend',
Expand Down
42 changes: 42 additions & 0 deletions tests/data/filter/coercedates/a-rss-1.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/">
<channel>
<title>Fake RSS Blog</title>
<link>http://fake.url.example.com</link>
<description>Fake RSS Feed For testing</description>
<image>
<url>http://fake.url.example.com/feedlogo.gif</url>
<title>Test RSS Feed</title>
<link>http://fake.url.example.com</link>
</image>
<language>en-us</language>
<copyright>Not Copyright 2011 Fake Feed, LLC. The contents of this headlines and excerpts feed are available for unlimited distribution.</copyright>
<generator>Blogsmith http://www.blogsmith.com/</generator>

<item>
<title>Fake Title: RSS Has No Date</title>
<link>http://fake.url.example.com/rss-no-date</link>
<guid isPermaLink="true">http://fake.url.example.com/rss-no-date</guid>
<comments>http://fake.url.example.com/rss-no-date#comments</comments>
<description>
<![CDATA[<p>Blah Blah Blah something poinient blah blah blah</p>]]>
</description>
<imageurl>http://fake.url.example.com/rss-no-date.gif</imageurl>
<dc:creator>Fake Person</dc:creator>
</item>


<item>
<title>Fake Title: RSS Has Changing Date</title>
<link>http://fake.url.example.com/rss-changing-date</link>
<guid isPermaLink="true">http://fake.url.example.com/rss-changing-date</guid>
<comments>http://fake.url.example.com/rss-changing-date#comments</comments>
<description>
<![CDATA[<p>Blah Blah Blah something poinient blah blah blah</p>]]>
</description>
<imageurl>http://fake.url.example.com/rss-changing-date.gif</imageurl>
<dc:creator>Fake Person</dc:creator>
<dc:date>2011-12-01T11:00:00+00:00</dc:date>
</item>

</channel></rss>
42 changes: 42 additions & 0 deletions tests/data/filter/coercedates/a-rss-2.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/">
<channel>
<title>Fake RSS Blog</title>
<link>http://fake.url.example.com</link>
<description>Fake RSS Feed For testing</description>
<image>
<url>http://fake.url.example.com/feedlogo.gif</url>
<title>Test RSS Feed</title>
<link>http://fake.url.example.com</link>
</image>
<language>en-us</language>
<copyright>Not Copyright 2011 Fake Feed, LLC. The contents of this headlines and excerpts feed are available for unlimited distribution.</copyright>
<generator>Blogsmith http://www.blogsmith.com/</generator>

<item>
<title>Fake Title: RSS Has No Date</title>
<link>http://fake.url.example.com/rss-no-date</link>
<guid isPermaLink="true">http://fake.url.example.com/rss-no-date</guid>
<comments>http://fake.url.example.com/rss-no-date#comments</comments>
<description>
<![CDATA[<p>Blah Blah Blah something poinient blah blah blah</p>]]>
</description>
<imageurl>http://fake.url.example.com/rss-no-date.gif</imageurl>
<dc:creator>Fake Person</dc:creator>
</item>


<item>
<title>Fake Title: RSS Has Changing Date</title>
<link>http://fake.url.example.com/rss-changing-date</link>
<guid isPermaLink="true">http://fake.url.example.com/rss-changing-date</guid>
<comments>http://fake.url.example.com/rss-changing-date#comments</comments>
<description>
<![CDATA[<p>Blah Blah Blah something poinient blah blah blah</p>]]>
</description>
<imageurl>http://fake.url.example.com/rss-changing-date.gif</imageurl>
<dc:creator>Fake Person</dc:creator>
<dc:date>2011-12-07T11:07:07+00:00</dc:date>
</item>

</channel></rss>
92 changes: 92 additions & 0 deletions tests/data/filter/coercedates/b-atom-1.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
<?xml version="1.0" encoding="UTF-8"?>
<feed
xmlns="http://www.w3.org/2005/Atom"
xmlns:thr="http://purl.org/syndication/thread/1.0"
xml:lang="en"
xml:base="http://fake.url.example.com/wp-atom.php"
>
<title type="text">Fake Atom Feed</title>
<subtitle type="text">Fake Atom feed for testing stuff</subtitle>

<updated>2011-12-08T00:00:28Z</updated>

<link rel="alternate" type="text/html" href="http://fake.url.example.com" />
<id>http://fake.url.example.com/feed/atom/</id>
<link rel="self" type="application/atom+xml" href="http://fake.url.example.com/feed/atom/" />

<generator uri="http://wordpress.org/" version="3.2.1">WordPress</generator>




<entry>
<author>
<name>Fake Person</name>
<uri>http://fake.url.example.com</uri>
</author>
<title type="html"><![CDATA[Atom Changing Updated Date]]></title>
<link rel="alternate" type="text/html" href="http://fake.url.example.com/atom-changing-updated" />
<id>http://fake.url.example.com/atom-changing-updated</id>
<updated>2011-12-05T10:06:38Z</updated>
<published>2011-11-09T00:00:28Z</published>
<summary type="html"><![CDATA[Blah Blah Blah [...]]]></summary>
<content type="html" xml:base="http://fake.url.example.com/atom-changing-updated"><![CDATA[Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah]]></content>
<link rel="replies" type="text/html" href="http://fake.url.example.com/atom-changing-updated#comments" thr:count="0"/>
<link rel="replies" type="application/atom+xml" href="http://fake.url.example.com/atom-changing-updated/feed/atom/" thr:count="0"/>
<thr:total>0</thr:total>
</entry>



<entry>
<author>
<name>Fake Person</name>
<uri>http://fake.url.example.com</uri>
</author>
<title type="html"><![CDATA[Atom Changing Published Date]]></title>
<link rel="alternate" type="text/html" href="http://fake.url.example.com/atom-changing-published" />
<id>http://fake.url.example.com/atom-changing-published</id>
<published>2011-12-08T02:02:28Z</published>
<summary type="html"><![CDATA[Blah Blah Blah [...]]]></summary>
<content type="html" xml:base="http://fake.url.example.com/atom-changing-published"><![CDATA[Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah]]></content>
<link rel="replies" type="text/html" href="http://fake.url.example.com/atom-changing-published#comments" thr:count="0"/>
<link rel="replies" type="application/atom+xml" href="http://fake.url.example.com/atom-changing-published/feed/atom/" thr:count="0"/>
<thr:total>0</thr:total>
</entry>


<entry>
<author>
<name>Fake Person</name>
<uri>http://fake.url.example.com</uri>
</author>
<title type="html"><![CDATA[Atom No Date]]></title>
<link rel="alternate" type="text/html" href="http://fake.url.example.com/atom-no-date" />
<id>http://fake.url.example.com/atom-no-date</id>
<summary type="html"><![CDATA[Blah Blah Blah [...]]]></summary>
<content type="html" xml:base="http://fake.url.example.com/atom-no-date"><![CDATA[Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah]]></content>
<link rel="replies" type="text/html" href="http://fake.url.example.com/atom-no-date#comments" thr:count="0"/>
<link rel="replies" type="application/atom+xml" href="http://fake.url.example.com/atom-no-date/feed/atom/" thr:count="0"/>
<thr:total>0</thr:total>
</entry>

<entry>
<author>
<name>Fake Person</name>
<uri>http://fake.url.example.com</uri>
</author>
<title type="html"><![CDATA[Atom Update Before Published]]></title>
<updated>2011-11-11T11:11:11Z</updated>
<published>2011-12-12T12:12:12Z</published>
<link rel="alternate" type="text/html" href="http://fake.url.example.com/atom-update-before-pub" />
<id>http://fake.url.example.com/atom-update-before-pub</id>
<summary type="html"><![CDATA[Blah Blah Blah [...]]]></summary>
<content type="html" xml:base="http://fake.url.example.com/atom-update-before-pub"><![CDATA[Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah Blah]]></content>
<link rel="replies" type="text/html" href="http://fake.url.example.com/atom-update-before-pub#comments" thr:count="0"/>
<link rel="replies" type="application/atom+xml" href="http://fake.url.example.com/atom-update-before-pub/feed/atom/" thr:count="0"/>
<thr:total>0</thr:total>
</entry>



</feed>
Loading