News Analysis Example
Extract and track facts from news articles over time.
Scenario
A news monitoring system that extracts facts from articles and tracks how information about companies and people changes over time.
Setup
require 'fact_db'
# Configure FactDb from the environment.
# ENV.fetch raises KeyError naming the missing variable, rather than silently
# assigning nil as the database URL or the LLM API key.
FactDb.configure do |config|
  config.database.url = ENV.fetch('DATABASE_URL')
  config.llm.provider = :openai
  config.llm.api_key = ENV.fetch('OPENAI_API_KEY')
end

# Entry-point object that the remaining examples call into.
facts = FactDb.new
Ingest News Articles
# Article 1: CEO announcement, captured on 2024-01-15
ceo_announcement = <<~TEXT
  TechCorp Appoints New CEO
  San Francisco, Jan 15, 2024 - TechCorp announced today that
  Jane Williams has been appointed as Chief Executive Officer,
  effective immediately. Williams previously served as COO at
  InnovateTech for 8 years.
  "We are thrilled to welcome Jane to lead TechCorp into its
  next chapter," said Board Chairman Robert Chen.
  Williams succeeds Michael Johnson, who is retiring after
  15 years at the helm.
TEXT

article1 = facts.ingest(
  ceo_announcement,
  type: :article,
  title: "TechCorp Appoints New CEO",
  source_uri: "https://news.example.com/techcorp-new-ceo",
  captured_at: Date.new(2024, 1, 15),
  metadata: { source: "Tech News Daily", author: "Sarah Reporter", category: "Business" }
)
# Article 2: quarterly earnings report, captured on 2024-02-01
earnings_report = <<~TEXT
  TechCorp Reports Record Q4 Earnings
  San Francisco, Feb 1, 2024 - TechCorp reported quarterly
  revenue of $5.2 billion, up 23% year-over-year. Net income
  reached $800 million.
  "Our cloud division continues to drive growth," said CEO
  Jane Williams in her first earnings call since taking over.
  The company also announced plans to acquire DataFlow Inc
  for $1.2 billion, expected to close in Q2 2024.
TEXT

article2 = facts.ingest(
  earnings_report,
  type: :article,
  title: "TechCorp Reports Record Q4 Earnings",
  source_uri: "https://news.example.com/techcorp-q4-earnings",
  captured_at: Date.new(2024, 2, 1),
  metadata: {
    source: "Financial Times",
    category: "Earnings"
  }
)
# Article 3: acquisition update, captured on 2024-04-15
deal_update = <<~TEXT
  TechCorp-DataFlow Deal Falls Through
  San Francisco, Apr 15, 2024 - TechCorp announced it has
  terminated its planned acquisition of DataFlow Inc, citing
  regulatory concerns.
  "After careful consideration, we have decided not to proceed
  with the acquisition," said TechCorp CEO Jane Williams.
TEXT

article3 = facts.ingest(
  deal_update,
  type: :article,
  title: "TechCorp-DataFlow Deal Falls Through",
  source_uri: "https://news.example.com/techcorp-dataflow-cancelled",
  captured_at: Date.new(2024, 4, 15),
  metadata: {
    source: "Business Wire",
    category: "M&A"
  }
)
Extract Facts with LLM
# Run the LLM extractor over each ingested article and print how many
# facts came back for it.
[article1, article2, article3].each do |source|
  puts "Processing: #{source.title}"
  fact_count = facts.extract_facts(source.id, extractor: :llm).count
  puts " Extracted #{fact_count} facts"
end
Review Extracted Entities
# Print every stored entity as "name (type)"
puts "\nExtracted Entities:"
FactDb::Models::Entity.all.each { |record| puts " #{record.name} (#{record.type})" }
Query Facts by Topic
# For each topic of interest (CEO first, then acquisitions), print a heading
# followed by one dated line per matching fact.
{ "CEO" => "CEO", "Acquisition" => "acquisition" }.each do |heading, topic|
  puts "\n#{heading} Facts:"
  facts.query_facts(topic: topic).each do |match|
    puts " #{match.valid_at.to_date}: #{match.text}"
  end
end
Track Entity Over Time
# Find TechCorp entity
techcorp = facts.resolve_entity("TechCorp", type: :organization)

if techcorp
  # Timeline of TechCorp facts; each line is followed by the title of the
  # fact's first source, or "Unknown" when none is attached.
  puts "\nTechCorp Timeline:"
  facts.timeline_for(techcorp.id).each do |fact|
    source_title = fact.fact_sources.first&.source&.title || "Unknown"
    puts " #{fact.valid_at.to_date}: #{fact.text}"
    puts " Source: #{source_title}"
  end
else
  # A failed lookup would otherwise raise NoMethodError on techcorp.id.
  puts "\nTechCorp entity not found; skipping timeline"
end
Handle Superseded Information
# The acquisition fact from article2 should be superseded by article3.
# Start from the first canonical fact matching the original announcement.
acquisition_fact = FactDb::Models::Fact.search_text("acquire DataFlow").canonical.first

if acquisition_fact
  # Replace it with the cancellation, valid from the date of article3.
  resolver = facts.fact_service.resolver
  resolver.supersede(
    acquisition_fact.id,
    "TechCorp cancelled its planned acquisition of DataFlow Inc",
    valid_at: Date.new(2024, 4, 15)
  )

  puts "\nAcquisition status updated:"
  # Re-read the record before printing its text, status and successor.
  acquisition_fact.reload
  puts " Original: #{acquisition_fact.text} (#{acquisition_fact.status})"
  puts " Updated: #{acquisition_fact.superseded_by.text}"
end
Corroborate Facts
# When more than one canonical fact matches the CEO search, treat the first
# as primary and register each of the others as corroborating it.
ceo_facts = FactDb::Models::Fact.search_text("Jane Williams CEO").canonical.to_a

if ceo_facts.size > 1
  primary, *supporting = ceo_facts
  supporting.each do |other|
    facts.fact_service.resolver.corroborate(primary.id, other.id)
  end
  puts "\nCEO fact corroborated by #{ceo_facts.size} sources"
end
Generate Company Report
# Build a summary hash for one company: its resolved name, the text of its
# current facts, the leadership derived from those facts, and a timeline of
# dated facts with the title of each fact's first source.
#
# @param facts [#resolve_entity, #current_facts_for, #timeline_for] fact store
# @param company_name [String] name to resolve as an :organization entity
# @return [Hash, nil] the report, or nil when the company cannot be resolved
def company_report(facts, company_name)
  company = facts.resolve_entity(company_name, type: :organization)
  return unless company

  current = facts.current_facts_for(company.id)
  report = {
    company: company.name,
    current_facts: current.map(&:text),
    leadership: extract_leadership(current)
  }

  # Added last so the hash keeps company, current_facts, leadership, timeline order.
  report[:timeline] = facts.timeline_for(company.id).map do |entry|
    {
      date: entry.valid_at,
      fact: entry.text,
      source: entry.fact_sources.first&.source&.title
    }
  end

  report
end
# Derive leadership roles from a list of facts.
#
# Only the CEO role is detected: a fact whose text contains "CEO" contributes
# the name of the entity behind its first mention with role "subject". When
# several facts yield a name, the last one wins.
#
# @param facts [Enumerable] objects responding to #text and #entity_mentions
# @return [Hash] e.g. { ceo: "Jane Williams" }; empty when no name was found
def extract_leadership(facts)
  facts.each_with_object({}) do |fact, leadership|
    # Regexp#match? accepts nil, so a fact without text is simply skipped.
    next unless /CEO/.match?(fact.text)

    subject = fact.entity_mentions.find { |mention| mention.mention_role == "subject" }
    name = subject&.entity&.name

    # Facts that mention "CEO" but have no named subject must not overwrite
    # a CEO already found with nil.
    leadership[:ceo] = name if name
  end
end
# Nothing visible in this example loads the JSON library, so require it
# explicitly before calling JSON.pretty_generate.
require 'json'

# Build the TechCorp report and print it as indented JSON.
report = company_report(facts, "TechCorp")
puts JSON.pretty_generate(report)
Batch Process News Feed
# Ingest a list of articles, then run LLM fact extraction over all of them in
# one batch call and summarize the outcome.
#
# @param facts [#ingest, #batch_extract] fact store
# @param articles [Array<Hash>] each with :text, :title, :url, :published_at
# @return [Hash] :processed (number of results), :successful (results whose
#   :error is nil) and :total_facts (facts summed over all results)
def process_news_feed(facts, articles)
  source_ids = articles.map do |article|
    facts.ingest(
      article[:text],
      type: :article,
      title: article[:title],
      source_uri: article[:url],
      captured_at: article[:published_at]
    ).id
  end

  results = facts.batch_extract(source_ids, extractor: :llm)

  {
    processed: results.count,
    successful: results.count { |result| result[:error].nil? },
    # A failed result may carry no :facts entry; count it as zero instead of
    # raising NoMethodError on nil.
    total_facts: results.sum { |result| (result[:facts] || []).count }
  }
end
# Example usage: two placeholder articles pushed through the batch pipeline
news_feed = ["Article 1", "Article 2"].map do |title|
  { title: title, text: "...", url: "...", published_at: Time.now }
end

stats = process_news_feed(facts, news_feed)
processed, total_facts = stats.values_at(:processed, :total_facts)
puts "Processed #{processed} articles, extracted #{total_facts} facts"
Monitor Specific Topics
# Collect facts matching a topic that were created after a cutoff, newest
# first, and summarize them.
#
# @param facts [Object] accepted for interface compatibility; not used here
#   (the query goes through FactDb::Models::Fact directly)
# @param topic [String] text passed to Fact.search_text
# @param since [Time] lower bound (exclusive) on created_at; defaults to one
#   week ago
# @return [Hash] :topic, :new_facts (count) and :facts (array of hashes with
#   :text, :date, :source and :entities)
def monitor_topic(facts, topic, since: 1.week.ago)
  # Load the matches once so :new_facts and :facts always describe the same
  # rows; calling count and then map on the relation evaluates it twice.
  recent = FactDb::Models::Fact
           .search_text(topic)
           .where("created_at > ?", since)
           .order(created_at: :desc)
           .to_a

  summaries = recent.map do |fact|
    {
      text: fact.text,
      date: fact.valid_at,
      source: fact.fact_sources.first&.source&.title,
      # Mentions without a linked entity are skipped rather than raising.
      entities: fact.entity_mentions.map { |mention| mention.entity&.name }.compact
    }
  end

  { topic: topic, new_facts: recent.size, facts: summaries }
end
# Monitor acquisitions: print how many matching facts the monitor reported
acquisition_updates = monitor_topic(facts, "acquisition")
recent_count = acquisition_updates[:new_facts]
puts "Recent acquisition news: #{recent_count} facts"