Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions config/initializers/inflections.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,7 @@
# ActiveSupport::Inflector.inflections(:en) do |inflect|
# inflect.acronym "RESTful"
# end

# Register "RSS" as an acronym so the inflector maps file names like
# event_rss_ingestor.rb to constants like EventRSSIngestor (and vice versa).
# NOTE(review): assumes autoloading of the RSS ingestor classes relies on
# this inflection — confirm against lib/ingestors/*_rss_ingestor.rb.
ActiveSupport::Inflector.inflections(:en) do |inflect|
inflect.acronym 'RSS'
end
84 changes: 84 additions & 0 deletions lib/ingestors/dublin_core_ingestion.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
module Ingestors
  # Shared helpers for mapping Dublin Core metadata (as found in RSS/Atom
  # feeds and similar XML sources) onto TeSS material/event attribute structs.
  module DublinCoreIngestion
    # Builds an OpenStruct of material attributes from a hash of Dublin Core
    # fields (:title, :description, :creators, :contributors, :rights,
    # :dates, :identifiers, :subjects, :types, :publisher).
    def build_material_from_dublin_core_data(dc)
      material = OpenStruct.new

      material.title = dc[:title]
      material.description = convert_description(dc[:description])
      material.authors = normalize_dublin_core_values(dc[:creators])
      material.contributors = normalize_dublin_core_values(dc[:contributors])

      # Prefer a licence URL over free-text rights statements.
      rights = normalize_dublin_core_values(dc[:rights])
      material.licence = rights.find { |r| r.start_with?('http://', 'https://') } || rights.first || 'notspecified'

      # Convention: first date = created, last date = modified (when distinct).
      parsed_dates = parse_dublin_core_dates(dc[:dates])
      material.date_created = parsed_dates.first
      material.date_modified = parsed_dates.last if parsed_dates.size > 1

      identifiers = normalize_dublin_core_values(dc[:identifiers])
      material.doi = extract_dublin_core_doi(identifiers)
      material.url = identifiers.find { |id| id.start_with?('http://', 'https://') }

      material.keywords = normalize_dublin_core_values(dc[:subjects])
      material.resource_type = normalize_dublin_core_values(dc[:types])
      material.contact = dublin_core_text(dc[:publisher])

      material
    end

    # Builds an OpenStruct of event attributes from a hash of Dublin Core
    # fields; falls back to the organizer as contact when no publisher given.
    def build_event_from_dublin_core_data(dc)
      event = OpenStruct.new

      event.title = dc[:title]
      event.description = convert_description(dc[:description])
      event.organizer = normalize_dublin_core_values(dc[:creators]).first
      event.contact = dublin_core_text(dc[:publisher]) || event.organizer
      event.keywords = normalize_dublin_core_values(dc[:subjects])
      event.event_types = normalize_dublin_core_values(dc[:types])

      # Convention: first date = start; single-date events end the same day.
      dates = parse_dublin_core_dates(dc[:dates])
      event.start = dates.first
      event.end = dates.last || dates.first

      identifiers = normalize_dublin_core_values(dc[:identifiers])
      event.url = identifiers.find { |id| id.start_with?('http://', 'https://') }

      event
    end

    # Parses each Dublin Core date string into a Date, silently dropping
    # values that cannot be parsed. Rescues only the errors Date.parse is
    # documented to raise (Date::Error is an ArgumentError subclass) rather
    # than all of StandardError.
    def parse_dublin_core_dates(dates)
      normalize_dublin_core_values(dates).map do |date_value|
        Date.parse(date_value)
      rescue ArgumentError, TypeError
        nil
      end.compact
    end

    # Finds a DOI among +identifiers+ (bare "10.x/y" or doi.org URL) and
    # returns it normalized to the canonical https://doi.org/ form, or nil.
    def extract_dublin_core_doi(identifiers)
      doi = normalize_dublin_core_values(identifiers).find do |id|
        id.start_with?('10.') || id.start_with?('https://doi.org/') || id.start_with?('http://doi.org/')
      end
      return nil unless doi

      normalized = doi.sub(%r{https?://doi\.org/}, '')
      "https://doi.org/#{normalized}"
    end

    # Coerces a scalar-or-array of Dublin Core values into a de-duplicated
    # array of stripped, non-blank strings. Single pass: text extraction,
    # stringification and stripping happen in one map block.
    def normalize_dublin_core_values(values)
      Array(values).map { |v| dublin_core_text(v).to_s.strip }
                   .reject(&:blank?)
                   .uniq
    end

    # Extracts plain text from a feed/XML value: rss gem nodes expose
    # +content+, Nokogiri nodes expose +text+; anything else is stringified.
    # This method is also used by RSS ingestion under an alias.
    def dublin_core_text(value)
      return nil if value.nil?
      return value.content if value.respond_to?(:content) # rss gem xml nodes
      return value.text if value.respond_to?(:text) && !value.is_a?(String) # Nokogiri xml nodes

      value.to_s
    end
  end
end
98 changes: 98 additions & 0 deletions lib/ingestors/event_rss_ingestor.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
require 'rss'
require 'tess_rdf_extractors'

module Ingestors
  # Ingests events from RSS 2.0, RSS 1.0 (RDF) and Atom feeds.
  # RDF feeds may additionally embed Bioschemas markup, which takes
  # priority over the plain RSS item data when both describe an event.
  class EventRSSIngestor < Ingestor
    include RSSIngestion

    def initialize
      super

      # Converts and de-duplicates Bioschemas structures found in RDF feeds.
      @bioschemas_manager = BioschemasIngestor.new
    end

    def self.config
      {
        key: 'event_rss',
        title: 'RSS / Atom Feed',
        category: :events
      }
    end

    # Fetches and parses the feed at +url+, adding one event per feed item.
    # Does nothing when the feed cannot be fetched/parsed.
    def read(url)
      feed, content, source_url = fetch_feed(url)
      return if feed.nil?

      case feed
      when RSS::Rss
        @messages << "Parsing RSS feed: #{feed_title(feed)}"
        feed.items.each { |item| add_event(build_event_from_rss_item(item, source_url)) }
      when RSS::RDF
        @messages << "Parsing RSS-RDF feed: #{feed_title(feed)}"
        rss_events = feed.items.map { |item| build_event_from_rss_item(item, source_url).to_h }
        bioschemas_events = extract_rdf_bioschemas_events(content)
        merge_with_bioschemas_priority(bioschemas_events, rss_events).each do |event|
          add_event(event)
        end
      when RSS::Atom::Feed
        @messages << "Parsing ATOM feed: #{feed_title(feed)}"
        feed.items.each { |item| add_event(build_event_from_atom_item(item, source_url)) }
      else
        # Record the actual parser class so unexpected formats can be diagnosed.
        @messages << "Parsing UNKNOWN feed (#{feed.class}): #{feed_title(feed)}"
        @messages << 'unsupported feed format'
      end
    end

    private

    # Extracts Bioschemas Event/Course/CourseInstance markup from RDF/XML
    # +content+, converted to TeSS params and de-duplicated.
    # Returns [] (and logs the error) if extraction fails for any reason.
    def extract_rdf_bioschemas_events(content)
      return [] unless content.present?

      extractor_classes = [
        Tess::Rdf::EventExtractor,
        Tess::Rdf::CourseExtractor,
        Tess::Rdf::CourseInstanceExtractor
      ]
      extracted = extractor_classes.flat_map do |extractor_class|
        extractor_class.new(content, :rdfxml).extract do |params|
          @bioschemas_manager.convert_params(params)
        end
      end

      @bioschemas_manager.deduplicate(extracted)
    rescue StandardError => e
      Rails.logger.error("#{e.class}: #{e.message}")
      Rails.logger.error(e.backtrace.join("\n")) if e.backtrace&.any?
      @messages << 'An error occurred while extracting Bioschemas Events.'
      []
    end

    # Builds an event from one RSS item, layering item-level fields on top of
    # its Dublin Core metadata (Dublin Core values win where already present).
    def build_event_from_rss_item(item, feed_url)
      event = build_event_from_dublin_core_data(extract_dublin_core(item))

      event.title ||= text_value(item.title)
      native_url = resolve_feed_url(item.link, feed_url)
      event.url = native_url if native_url.present?
      event.description ||= convert_description(text_value(item.description) || text_value(item.content_encoded))
      event.keywords = merge_unique(event.keywords, extract_rss_keywords(item))
      organizer = text_value(item.respond_to?(:author) ? item.author : nil)
      event.organizer ||= organizer
      event.contact ||= organizer

      event
    end

    # Builds an event from one Atom entry, layering entry-level fields on top
    # of its Dublin Core metadata.
    def build_event_from_atom_item(item, feed_url)
      event = build_event_from_dublin_core_data(extract_dublin_core(item))

      event.title ||= text_value(item.title)
      native_url = resolve_feed_url(extract_atom_link(item), feed_url)
      event.url = native_url if native_url.present?
      event.description ||= convert_description(text_value(item.summary) || text_value(item.content))
      event.keywords = merge_unique(event.keywords, extract_atom_keywords(item))
      organizer = extract_atom_authors(item).first
      event.organizer ||= organizer
      event.contact ||= organizer

      event
    end
  end
end
2 changes: 2 additions & 0 deletions lib/ingestors/ingestor_factory.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ def self.ingestors
Ingestors::ZenodoIngestor,
Ingestors::OaiPmhIngestor,
Ingestors::GithubIngestor,
Ingestors::EventRSSIngestor,
Ingestors::MaterialRSSIngestor
] + taxila_ingestors + llm_ingestors + heptraining_ingestors
end

Expand Down
111 changes: 111 additions & 0 deletions lib/ingestors/material_rss_ingestor.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
require 'rss'
require_relative '../rss/media'
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this needed? Should be able to autoload

require 'tess_rdf_extractors'

module Ingestors
  # Ingests training materials from RSS 2.0, RSS 1.0 (RDF) and Atom feeds.
  # RDF feeds may additionally embed Bioschemas LearningResource markup,
  # which takes priority over the plain RSS item data.
  class MaterialRSSIngestor < Ingestor
    include RSSIngestion

    def initialize
      super

      # Converts and de-duplicates Bioschemas structures found in RDF feeds.
      @bioschemas_manager = BioschemasIngestor.new
    end

    def self.config
      {
        key: 'material_rss',
        title: 'RSS / Atom Feed',
        category: :materials
      }
    end

    # Fetches and parses the feed at +url+, adding one material per feed item.
    # Does nothing when the feed cannot be fetched/parsed.
    def read(url)
      feed, content, source_url = fetch_feed(url)
      return if feed.nil?

      case feed
      when RSS::Rss
        @messages << "Parsing RSS feed: #{feed_title(feed)}"
        feed.items.each { |item| add_material(build_material_from_rss_item(item, source_url)) }
      when RSS::RDF
        @messages << "Parsing RSS-RDF feed: #{feed_title(feed)}"
        rss_materials = feed.items.map { |item| build_material_from_rss_item(item, source_url).to_h }
        bioschemas_materials = extract_rdf_bioschemas_materials(content)
        merge_with_bioschemas_priority(bioschemas_materials, rss_materials).each do |material|
          add_material(material)
        end
      when RSS::Atom::Feed
        @messages << "Parsing ATOM feed: #{feed_title(feed)}"
        feed.items.each { |item| add_material(build_material_from_atom_item(item, source_url)) }
      else
        # Record the actual parser class so unexpected formats can be diagnosed.
        @messages << "Parsing UNKNOWN feed (#{feed.class}): #{feed_title(feed)}"
        @messages << 'unsupported feed format'
      end
    end

    private

    # Extracts Bioschemas LearningResource markup from RDF/XML +content+,
    # converted to TeSS params and de-duplicated.
    # Returns [] (and logs the error) if extraction fails for any reason.
    def extract_rdf_bioschemas_materials(content)
      return [] unless content.present?

      materials = Tess::Rdf::LearningResourceExtractor.new(content, :rdfxml).extract do |params|
        @bioschemas_manager.convert_params(params)
      end

      @bioschemas_manager.deduplicate(materials)
    rescue StandardError => e
      Rails.logger.error("#{e.class}: #{e.message}")
      Rails.logger.error(e.backtrace.join("\n")) if e.backtrace&.any?
      @messages << 'An error occurred while extracting Bioschemas LearningResources.'
      []
    end

    # Builds a material from one RSS item, layering item-level (and iTunes
    # podcast extension) fields on top of its Dublin Core metadata.
    def build_material_from_rss_item(item, feed_url)
      material = build_material_from_dublin_core_data(extract_dublin_core(item))

      material.title ||= text_value(item.title)
      native_url = resolve_feed_url(item.link, feed_url)
      material.url = native_url if native_url.present?
      itunes_summary = text_value(item.itunes_summary) if item.respond_to?(:itunes_summary)
      material.description ||= convert_description(text_value(item.description) || text_value(item.content_encoded) || itunes_summary)
      material.keywords = merge_unique(material.keywords, extract_rss_keywords(item))
      author = item.author if item.respond_to?(:author)
      itunes_author = item.itunes_author if item.respond_to?(:itunes_author)
      # Compact BOTH author candidates — previously only the iTunes author was
      # compacted, so a missing <author> passed nil through to merge_unique.
      material.authors = merge_unique(material.authors, [text_value(author), text_value(itunes_author)].compact)
      material.contact ||= material.authors&.first
      guid = item.guid if item.respond_to?(:guid)
      material.doi ||= extract_dublin_core_doi([text_value(guid)])

      # Prefer <pubDate> over <dc:date>; parse <dc:date> once and reuse it.
      pub_date = parse_time(item.pubDate) if item.respond_to?(:pubDate)
      dc_date = parse_time(item.date) if item.respond_to?(:date)
      item_date = pub_date || dc_date
      material.date_published ||= item_date
      material.date_created = prefer_precise_time(material.date_created, item_date)
      material.date_modified = prefer_precise_time(material.date_modified, dc_date) if item.respond_to?(:date)

      material
    end

    # Builds a material from one Atom entry, layering entry-level (and Media
    # RSS extension) fields on top of its Dublin Core metadata.
    def build_material_from_atom_item(item, feed_url)
      material = build_material_from_dublin_core_data(extract_dublin_core(item))

      media_title = text_value(item.media_group&.media_title)
      material.title ||= text_value(item.title) || media_title
      native_url = resolve_feed_url(extract_atom_link(item), feed_url)
      material.url = native_url if native_url.present?
      media_group_description = text_value(item.media_group&.media_description)
      material.description ||= convert_description(text_value(item.summary) || text_value(item.content) || media_group_description)
      material.keywords = merge_unique(material.keywords, extract_atom_keywords(item))
      material.authors = merge_unique(material.authors, extract_atom_authors(item))
      material.contact ||= material.authors&.first
      material.doi ||= extract_dublin_core_doi([text_value(item.id)])

      published = parse_time(item.published)
      updated = parse_time(item.updated)
      material.date_created = prefer_precise_time(material.date_created, published)
      material.date_published ||= published || updated
      material.date_modified = prefer_precise_time(material.date_modified, updated)

      material
    end
  end
end
Loading
Loading