-
Notifications
You must be signed in to change notification settings - Fork 19
RSS ingestor #1276
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
RSS ingestor #1276
Changes from all commits
5af12ee
e36dbc2
be76ff7
8c2880b
54895a2
3cea73b
2c7c05e
a515e46
cd91db6
b8f19c6
b2780cf
0f042e7
89e5f53
662c450
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,84 @@ | ||
module Ingestors
  # Maps a hash of Dublin Core fields onto OpenStruct records for materials and events.
  #
  # The including class must provide +convert_description+. +blank?+ and +presence+
  # are assumed to come from ActiveSupport (neither is defined in this module).
  module DublinCoreIngestion
    # Resolver prefix stripped from DOI identifiers. Anchored with \A so that only a
    # leading resolver URL is removed, never one embedded later in the identifier.
    DUBLIN_CORE_DOI_RESOLVER = %r{\Ahttps?://doi\.org/}.freeze

    # Builds a material record from Dublin Core data.
    #
    # @param dc [Hash] read via the keys :title, :description, :creators, :contributors,
    #   :rights, :dates, :identifiers, :subjects, :types and :publisher
    # @return [OpenStruct] record with the material attributes filled in
    def build_material_from_dublin_core_data(dc)
      material = OpenStruct.new

      material.title = dc[:title]
      material.description = convert_description(dc[:description])
      material.authors = normalize_dublin_core_values(dc[:creators])
      material.contributors = normalize_dublin_core_values(dc[:contributors])

      # Prefer a licence URL, then any rights statement, then the 'notspecified' marker.
      rights = normalize_dublin_core_values(dc[:rights])
      material.licence = rights.find { |r| r.start_with?('http://', 'https://') } || rights.first || 'notspecified'

      # First parseable date is the creation date; the last one counts as the
      # modification date only when more than one date was parsed.
      parsed_dates = parse_dublin_core_dates(dc[:dates])
      material.date_created = parsed_dates.first
      material.date_modified = parsed_dates.last if parsed_dates.size > 1

      identifiers = normalize_dublin_core_values(dc[:identifiers])
      material.doi = extract_dublin_core_doi(identifiers)
      material.url = identifiers.find { |id| id.start_with?('http://', 'https://') }

      material.keywords = normalize_dublin_core_values(dc[:subjects])
      material.resource_type = normalize_dublin_core_values(dc[:types])
      # An empty publisher becomes nil so that callers' `contact ||= ...` fallbacks apply.
      material.contact = dublin_core_text(dc[:publisher]).presence

      material
    end

    # Builds an event record from Dublin Core data.
    #
    # @param dc [Hash] read via the keys :title, :description, :creators, :publisher,
    #   :subjects, :types, :dates and :identifiers
    # @return [OpenStruct] record with the event attributes filled in
    def build_event_from_dublin_core_data(dc)
      event = OpenStruct.new

      event.title = dc[:title]
      event.description = convert_description(dc[:description])
      event.organizer = normalize_dublin_core_values(dc[:creators]).first
      # A blank publisher must not mask the organizer fallback ('' is truthy in Ruby).
      event.contact = dublin_core_text(dc[:publisher]).presence || event.organizer
      event.keywords = normalize_dublin_core_values(dc[:subjects])
      event.event_types = normalize_dublin_core_values(dc[:types])

      # With a single date, first and last are the same value, so start == end.
      dates = parse_dublin_core_dates(dc[:dates])
      event.start = dates.first
      event.end = dates.last

      identifiers = normalize_dublin_core_values(dc[:identifiers])
      event.url = identifiers.find { |id| id.start_with?('http://', 'https://') }

      event
    end

    # Parses every value that Date.parse accepts, silently dropping the rest.
    #
    # @param dates [Object] a single value, a list of values, or nil
    # @return [Array<Date>] parsed dates, in input order
    def parse_dublin_core_dates(dates)
      normalize_dublin_core_values(dates).map do |date_value|
        Date.parse(date_value)
      rescue ArgumentError
        # Date::Error (unparseable text) is an ArgumentError subclass; over-long
        # strings raise a plain ArgumentError.
        nil
      end.compact
    end

    # Finds the first identifier that looks like a DOI and returns it as a
    # https://doi.org/ URL.
    #
    # @param identifiers [Object] a single value, a list of values, or nil
    # @return [String, nil] the DOI URL, or nil when no identifier carries a DOI
    def extract_dublin_core_doi(identifiers)
      normalize_dublin_core_values(identifiers).each do |id|
        next unless id.start_with?('10.') || id.match?(DUBLIN_CORE_DOI_RESOLVER)

        suffix = id.sub(DUBLIN_CORE_DOI_RESOLVER, '')
        # A bare resolver URL carries no DOI; keep looking instead of returning it.
        return "https://doi.org/#{suffix}" unless suffix.empty?
      end

      nil
    end

    # Coerces the input to a list of stripped, non-blank, unique strings.
    #
    # @param values [Object] a single value, a list of values, or nil
    # @return [Array<String>]
    def normalize_dublin_core_values(values)
      Array(values)
        .map { |v| dublin_core_text(v).to_s.strip }
        .reject(&:blank?)
        .uniq
    end

    # this method is also used by RSS ingestion under an alias
    #
    # Returns the text of a value: +content+ when the value responds to it, +text+
    # for non-String values that respond to it, otherwise +to_s+. nil stays nil.
    #
    # @param value [Object, nil]
    # @return [String, nil] (whatever +content+ / +text+ return for node-like values)
    def dublin_core_text(value)
      return nil if value.nil?
      return value.content if value.respond_to?(:content) # rss gem xml nodes
      return value.text if value.respond_to?(:text) && !value.is_a?(String) # Nokogiri xml nodes

      value.to_s
    end
  end
end
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,98 @@ | ||
| require 'rss' | ||
| require 'tess_rdf_extractors' | ||
|
|
||
module Ingestors
  # Ingests events from RSS 2.0, RSS 1.0 (RDF) and Atom feeds.
  #
  # Helpers such as +fetch_feed+, +feed_title+, +text_value+, +resolve_feed_url+,
  # +merge_unique+, +extract_dublin_core+ and +merge_with_bioschemas_priority+ are
  # not defined in this class; presumably they come from RSSIngestion or Ingestor
  # (confirm against those files).
  class EventRSSIngestor < Ingestor
    include RSSIngestion

    def initialize
      super

      # Used to convert and deduplicate Bioschemas params found in RSS-RDF feeds.
      @bioschemas_manager = BioschemasIngestor.new
    end

    # @return [Hash] registration details for this ingestor (key, title, category)
    def self.config
      {
        key: 'event_rss',
        title: 'RSS / Atom Feed',
        category: :events
      }
    end

    # Fetches the feed at +url+ and adds one event per feed item.
    #
    # For RSS-RDF feeds the item data is additionally merged with any Bioschemas
    # events extracted from the raw feed content. Feeds parsed into any other class
    # are reported in @messages and skipped.
    #
    # @param url [String] location of the feed
    def read(url)
      feed, content, source_url = fetch_feed(url)
      return if feed.nil?

      case feed
      when RSS::Rss
        @messages << "Parsing RSS feed: #{feed_title(feed)}"
        feed.items.each { |item| add_event(build_event_from_rss_item(item, source_url)) }
      when RSS::RDF
        @messages << "Parsing RSS-RDF feed: #{feed_title(feed)}"
        rss_events = feed.items.map { |item| build_event_from_rss_item(item, source_url).to_h }
        bioschemas_events = extract_rdf_bioschemas_events(content)
        merge_with_bioschemas_priority(bioschemas_events, rss_events).each do |event|
          add_event(event)
        end
      when RSS::Atom::Feed
        @messages << "Parsing ATOM feed: #{feed_title(feed)}"
        feed.items.each { |item| add_event(build_event_from_atom_item(item, source_url)) }
      else
        @messages << "Parsing UNKNOWN feed: #{feed_title(feed)}"
        # Name the parsed class so the user can tell what kind of document was rejected.
        @messages << "unsupported feed format: #{feed.class.name}"
      end
    end

    private

    # Runs the Event, Course and CourseInstance extractors over the raw RDF/XML
    # feed content and returns the deduplicated, converted params.
    #
    # Any error is logged and reported in @messages, and an empty list is returned.
    #
    # @param content [String, nil] raw feed body
    # @return [Array] deduplicated event params ([] when content is blank or on error)
    def extract_rdf_bioschemas_events(content)
      return [] unless content.present?

      extractors = [Tess::Rdf::EventExtractor, Tess::Rdf::CourseExtractor, Tess::Rdf::CourseInstanceExtractor]
      extracted = extractors.flat_map do |extractor|
        extractor.new(content, :rdfxml).extract { |params| @bioschemas_manager.convert_params(params) }
      end

      @bioschemas_manager.deduplicate(extracted)
    rescue StandardError => e
      Rails.logger.error("#{e.class}: #{e.message}")
      Rails.logger.error(e.backtrace.join("\n")) if e.backtrace&.any?
      @messages << 'An error occurred while extracting Bioschemas Events.'
      []
    end

    # Builds an event from an RSS item: Dublin Core data first, with the item's
    # native fields filling the gaps. The native link, when present, overrides
    # the Dublin Core URL.
    #
    # @param item [Object] an RSS item
    # @param feed_url [String] passed to resolve_feed_url together with the item link
    # @return [OpenStruct]
    def build_event_from_rss_item(item, feed_url)
      event = build_event_from_dublin_core_data(extract_dublin_core(item))

      event.title ||= text_value(item.title)
      native_url = resolve_feed_url(item.link, feed_url)
      event.url = native_url if native_url.present?
      event.description ||= convert_description(text_value(item.description) || text_value(item.content_encoded))
      event.keywords = merge_unique(event.keywords, extract_rss_keywords(item))
      organizer = text_value(item.respond_to?(:author) ? item.author : nil)
      event.organizer ||= organizer
      event.contact ||= organizer

      event
    end

    # Builds an event from an Atom entry: Dublin Core data first, with the entry's
    # native fields filling the gaps. The native link, when present, overrides
    # the Dublin Core URL.
    #
    # @param item [Object] an Atom entry
    # @param feed_url [String] passed to resolve_feed_url together with the entry link
    # @return [OpenStruct]
    def build_event_from_atom_item(item, feed_url)
      event = build_event_from_dublin_core_data(extract_dublin_core(item))

      event.title ||= text_value(item.title)
      native_url = resolve_feed_url(extract_atom_link(item), feed_url)
      event.url = native_url if native_url.present?
      event.description ||= convert_description(text_value(item.summary) || text_value(item.content))
      event.keywords = merge_unique(event.keywords, extract_atom_keywords(item))
      organizer = extract_atom_authors(item).first
      event.organizer ||= organizer
      event.contact ||= organizer

      event
    end
  end
end
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,111 @@ | ||
| require 'rss' | ||
| require_relative '../rss/media' | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this needed? Should be able to autoload. |
||
| require 'tess_rdf_extractors' | ||
|
|
||
module Ingestors
  # Ingests materials from RSS 2.0, RSS 1.0 (RDF) and Atom feeds.
  #
  # Helpers such as +fetch_feed+, +feed_title+, +text_value+, +parse_time+,
  # +prefer_precise_time+, +resolve_feed_url+, +merge_unique+, +extract_dublin_core+
  # and +merge_with_bioschemas_priority+ are not defined in this class; presumably
  # they come from RSSIngestion or Ingestor (confirm against those files).
  class MaterialRSSIngestor < Ingestor
    include RSSIngestion

    def initialize
      super

      # Used to convert and deduplicate Bioschemas params found in RSS-RDF feeds.
      @bioschemas_manager = BioschemasIngestor.new
    end

    # @return [Hash] registration details for this ingestor (key, title, category)
    def self.config
      {
        key: 'material_rss',
        title: 'RSS / Atom Feed',
        category: :materials
      }
    end

    # Fetches the feed at +url+ and adds one material per feed item.
    #
    # For RSS-RDF feeds the item data is additionally merged with any Bioschemas
    # learning resources extracted from the raw feed content. Feeds parsed into any
    # other class are reported in @messages and skipped.
    #
    # @param url [String] location of the feed
    def read(url)
      feed, content, source_url = fetch_feed(url)
      return if feed.nil?

      case feed
      when RSS::Rss
        @messages << "Parsing RSS feed: #{feed_title(feed)}"
        feed.items.each { |item| add_material(build_material_from_rss_item(item, source_url)) }
      when RSS::RDF
        @messages << "Parsing RSS-RDF feed: #{feed_title(feed)}"
        rss_materials = feed.items.map { |item| build_material_from_rss_item(item, source_url).to_h }
        bioschemas_materials = extract_rdf_bioschemas_materials(content)
        merge_with_bioschemas_priority(bioschemas_materials, rss_materials).each do |material|
          add_material(material)
        end
      when RSS::Atom::Feed
        @messages << "Parsing ATOM feed: #{feed_title(feed)}"
        feed.items.each { |item| add_material(build_material_from_atom_item(item, source_url)) }
      else
        @messages << "Parsing UNKNOWN feed: #{feed_title(feed)}"
        # Name the parsed class so the user can tell what kind of document was rejected.
        @messages << "unsupported feed format: #{feed.class.name}"
      end
    end

    private

    # Runs the LearningResource extractor over the raw RDF/XML feed content and
    # returns the deduplicated, converted params.
    #
    # Any error is logged and reported in @messages, and an empty list is returned.
    #
    # @param content [String, nil] raw feed body
    # @return [Array] deduplicated material params ([] when content is blank or on error)
    def extract_rdf_bioschemas_materials(content)
      return [] unless content.present?

      materials = Tess::Rdf::LearningResourceExtractor.new(content, :rdfxml).extract do |params|
        @bioschemas_manager.convert_params(params)
      end

      @bioschemas_manager.deduplicate(materials)
    rescue StandardError => e
      Rails.logger.error("#{e.class}: #{e.message}")
      Rails.logger.error(e.backtrace.join("\n")) if e.backtrace&.any?
      @messages << 'An error occurred while extracting Bioschemas LearningResources.'
      []
    end

    # Builds a material from an RSS item: Dublin Core data first, with the item's
    # native (and iTunes) fields filling the gaps. The native link, when present,
    # overrides the Dublin Core URL.
    #
    # @param item [Object] an RSS item
    # @param feed_url [String] passed to resolve_feed_url together with the item link
    # @return [OpenStruct]
    def build_material_from_rss_item(item, feed_url)
      material = build_material_from_dublin_core_data(extract_dublin_core(item))

      material.title ||= text_value(item.title)
      native_url = resolve_feed_url(item.link, feed_url)
      material.url = native_url if native_url.present?
      itunes_summary = text_value(item.itunes_summary) if item.respond_to?(:itunes_summary)
      material.description ||= convert_description(text_value(item.description) || text_value(item.content_encoded) || itunes_summary)
      material.keywords = merge_unique(material.keywords, extract_rss_keywords(item))

      author = item.author if item.respond_to?(:author)
      itunes_author = item.itunes_author if item.respond_to?(:itunes_author)
      # Compact the combined list: `[a] + [b].compact` would only compact the second
      # array and let a nil RSS author through.
      feed_authors = [text_value(author), text_value(itunes_author)].compact
      material.authors = merge_unique(material.authors, feed_authors)
      material.contact ||= material.authors&.first

      guid = item.guid if item.respond_to?(:guid)
      material.doi ||= extract_dublin_core_doi([text_value(guid)])

      # pubDate is preferred for the publication date, falling back to `date`.
      pub_date = parse_time(item.pubDate) if item.respond_to?(:pubDate)
      plain_date = parse_time(item.date) if item.respond_to?(:date)
      item_date = pub_date || plain_date
      material.date_published ||= item_date
      material.date_created = prefer_precise_time(material.date_created, item_date)
      material.date_modified = prefer_precise_time(material.date_modified, plain_date) if item.respond_to?(:date)

      material
    end

    # Builds a material from an Atom entry: Dublin Core data first, with the entry's
    # native and media-group fields filling the gaps. The native link, when present,
    # overrides the Dublin Core URL.
    #
    # @param item [Object] an Atom entry
    # @param feed_url [String] passed to resolve_feed_url together with the entry link
    # @return [OpenStruct]
    def build_material_from_atom_item(item, feed_url)
      material = build_material_from_dublin_core_data(extract_dublin_core(item))

      media_title = text_value(item.media_group&.media_title)
      material.title ||= text_value(item.title) || media_title
      native_url = resolve_feed_url(extract_atom_link(item), feed_url)
      material.url = native_url if native_url.present?
      media_group_description = text_value(item.media_group&.media_description)
      material.description ||= convert_description(text_value(item.summary) || text_value(item.content) || media_group_description)
      material.keywords = merge_unique(material.keywords, extract_atom_keywords(item))
      material.authors = merge_unique(material.authors, extract_atom_authors(item))
      material.contact ||= material.authors&.first
      material.doi ||= extract_dublin_core_doi([text_value(item.id)])

      published = parse_time(item.published)
      updated = parse_time(item.updated)
      material.date_created = prefer_precise_time(material.date_created, published)
      material.date_published ||= published || updated
      material.date_modified = prefer_precise_time(material.date_modified, updated)

      material
    end
  end
end
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can this rescue be more specific?