Initial untested implementation

Alex Selimov 2025-08-04 15:56:02 -04:00
parent 24200b899a
commit 193a50dd3b
8 changed files with 917 additions and 11 deletions

View file

@@ -1,2 +1,28 @@
def main() -> None:
print("Hello from rss2newsletter!")
"""
rss2newsletter - A simple tool for generating HTML newsletters from RSS feeds.
This package provides functionality to:
- Fetch articles from RSS feeds
- Filter articles published today
- Generate AI summaries using Ollama
- Create clean HTML newsletters
"""
__version__ = "0.1.0"
__author__ = "Your Name"
__email__ = "your.email@example.com"
from .rss_fetcher import get_todays_articles
from .ollama_client import create_ollama_client, summarize_articles
from .html_generator import generate_newsletter_html, save_newsletter_html
from .config import get_config_from_env, setup_logging
__all__ = [
"get_todays_articles",
"create_ollama_client",
"summarize_articles",
"generate_newsletter_html",
"save_newsletter_html",
"get_config_from_env",
"setup_logging",
]
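
For orientation, here is a minimal sketch of how these package-level exports are meant to compose end to end (the feed URL is a placeholder, and a local Ollama server is assumed to be running):

from rss2newsletter import (
    get_config_from_env,
    setup_logging,
    get_todays_articles,
    create_ollama_client,
    summarize_articles,
    generate_newsletter_html,
    save_newsletter_html,
)

config = get_config_from_env()
setup_logging(config)

articles = get_todays_articles("https://example.com/feed.xml")  # placeholder URL
client = create_ollama_client(config["ollama"]["base_url"])
summarized = summarize_articles(client, articles, config["ollama"]["model"])

html = generate_newsletter_html(summarized, config["output"]["feed_title"])
save_newsletter_html(html, config["output"]["filename"])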

View file

@@ -0,0 +1,161 @@
"""Main entry point for rss2newsletter."""

import sys
import argparse
import logging
from typing import Dict

from .config import get_config_from_env, setup_logging
from .rss_fetcher import get_todays_articles
from .ollama_client import create_ollama_client, summarize_articles
from .html_generator import generate_newsletter_html, save_newsletter_html

logger = logging.getLogger(__name__)


def create_argument_parser() -> argparse.ArgumentParser:
    """Create command line argument parser."""
    parser = argparse.ArgumentParser(
        description="Generate HTML newsletter from RSS feed using Ollama AI summaries",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python -m rss2newsletter https://feeds.example.com/rss
  python -m rss2newsletter https://blog.example.com/feed.xml --output my_newsletter.html
  python -m rss2newsletter https://news.example.com/rss --model llama3.1
        """,
    )
    parser.add_argument("rss_url", help="RSS feed URL to process")
    parser.add_argument(
        "--output",
        "-o",
        default=None,
        help="Output HTML filename (default: newsletter.html)",
    )
    parser.add_argument(
        "--model",
        "-m",
        default=None,
        help="Ollama model to use for summaries (default: llama3.2)",
    )
    parser.add_argument(
        "--ollama-url",
        default=None,
        help="Ollama server URL (default: http://localhost:11434)",
    )
    parser.add_argument(
        "--title", "-t", default=None, help="Newsletter title (default: RSS Newsletter)"
    )
    parser.add_argument(
        "--verbose", "-v", action="store_true", help="Enable verbose logging"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Fetch articles but don't generate summaries or save HTML",
    )
    return parser


def merge_config_with_args(config: Dict, args: argparse.Namespace) -> Dict:
    """Merge configuration with command line arguments."""
    if args.output:
        config["output"]["filename"] = args.output
    if args.model:
        config["ollama"]["model"] = args.model
    if args.ollama_url:
        config["ollama"]["base_url"] = args.ollama_url
    if args.title:
        config["output"]["feed_title"] = args.title
    if args.verbose:
        config["logging"]["level"] = "DEBUG"
    return config


def process_rss_to_newsletter(rss_url: str, config: Dict, dry_run: bool = False) -> str:
    """Main processing pipeline for RSS to newsletter conversion."""
    logger.info(f"Starting RSS newsletter generation for: {rss_url}")

    # Step 1: Fetch today's articles
    logger.info("Fetching articles from RSS feed...")
    articles = get_todays_articles(rss_url)
    if not articles:
        logger.warning("No articles found for today")
        return generate_newsletter_html([], config["output"]["feed_title"])
    logger.info(f"Found {len(articles)} articles from today")

    if dry_run:
        logger.info("Dry run mode - stopping before summarization")
        for i, article in enumerate(articles, 1):
            logger.info(f"Article {i}: {article['title']}")
        return ""

    # Step 2: Create Ollama client and summarize articles
    logger.info("Generating AI summaries...")
    ollama_client = create_ollama_client(config["ollama"]["base_url"])
    summarized_articles = summarize_articles(
        ollama_client, articles, config["ollama"]["model"]
    )

    # Step 3: Generate HTML newsletter
    logger.info("Generating HTML newsletter...")
    html_content = generate_newsletter_html(
        summarized_articles, config["output"]["feed_title"]
    )
    return html_content


def main() -> int:
    """Main entry point."""
    parser = create_argument_parser()
    args = parser.parse_args()

    # Load and merge configuration
    config = get_config_from_env()
    config = merge_config_with_args(config, args)

    # Setup logging
    setup_logging(config)

    try:
        # Process RSS feed to newsletter
        html_content = process_rss_to_newsletter(args.rss_url, config, args.dry_run)

        if not args.dry_run and html_content:
            # Save HTML file
            output_filename = save_newsletter_html(
                html_content, config["output"]["filename"]
            )
            print(f"Newsletter saved to: {output_filename}")

        return 0
    except KeyboardInterrupt:
        logger.info("Process interrupted by user")
        return 1
    except Exception as e:
        logger.error(f"Error generating newsletter: {e}")
        if config["logging"]["level"] == "DEBUG":
            logger.exception("Full error details:")
        return 1


if __name__ == "__main__":
    sys.exit(main())
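
To illustrate the flag-over-environment precedence implemented by merge_config_with_args, a small sketch (the Namespace is built by hand to mirror what the parser above would produce; it assumes the functions defined in this module are in scope):

import argparse

# Equivalent of: python -m rss2newsletter https://example.com/feed.xml -o tech.html -m llama3.1 -v
args = argparse.Namespace(
    rss_url="https://example.com/feed.xml",
    output="tech.html",
    model="llama3.1",
    ollama_url=None,
    title=None,
    verbose=True,
    dry_run=False,
)

config = merge_config_with_args(get_config_from_env(), args)
assert config["output"]["filename"] == "tech.html"
assert config["ollama"]["model"] == "llama3.1"
assert config["logging"]["level"] == "DEBUG"  # --verbose flips the log level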

View file

@@ -0,0 +1,75 @@
"""Configuration management for rss2newsletter."""
import os
from typing import Dict, Any
import logging
logger = logging.getLogger(__name__)


def get_default_config() -> Dict[str, Any]:
    """Get default configuration values."""
    return {
        "ollama": {
            "base_url": "http://localhost:11434",
            "model": "llama3.2",
            "timeout": 60,
            "max_summary_length": 150,
        },
        "output": {"filename": "newsletter.html", "feed_title": "RSS Newsletter"},
        "logging": {
            "level": "INFO",
            "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        },
    }


def get_config_from_env() -> Dict[str, Any]:
    """Get configuration from environment variables."""
    config = get_default_config()

    # Ollama configuration
    if os.getenv("OLLAMA_BASE_URL"):
        config["ollama"]["base_url"] = os.getenv("OLLAMA_BASE_URL")
    if os.getenv("OLLAMA_MODEL"):
        config["ollama"]["model"] = os.getenv("OLLAMA_MODEL")
    if os.getenv("OLLAMA_TIMEOUT"):
        try:
            config["ollama"]["timeout"] = int(os.getenv("OLLAMA_TIMEOUT"))
        except ValueError:
            logger.warning("Invalid OLLAMA_TIMEOUT value, using default")
    if os.getenv("MAX_SUMMARY_LENGTH"):
        try:
            config["ollama"]["max_summary_length"] = int(
                os.getenv("MAX_SUMMARY_LENGTH")
            )
        except ValueError:
            logger.warning("Invalid MAX_SUMMARY_LENGTH value, using default")

    # Output configuration
    if os.getenv("OUTPUT_FILENAME"):
        config["output"]["filename"] = os.getenv("OUTPUT_FILENAME")
    if os.getenv("FEED_TITLE"):
        config["output"]["feed_title"] = os.getenv("FEED_TITLE")

    # Logging configuration
    if os.getenv("LOG_LEVEL"):
        config["logging"]["level"] = os.getenv("LOG_LEVEL").upper()

    return config


def setup_logging(config: Dict[str, Any]) -> None:
    """Setup logging based on configuration."""
    log_level = getattr(logging, config["logging"]["level"], logging.INFO)
    log_format = config["logging"]["format"]
    logging.basicConfig(level=log_level, format=log_format, datefmt="%Y-%m-%d %H:%M:%S")

    # Reduce noise from external libraries
    logging.getLogger("urllib3").setLevel(logging.WARNING)
    logging.getLogger("requests").setLevel(logging.WARNING)

View file

@@ -0,0 +1,227 @@
"""HTML newsletter generation functionality."""
from datetime import datetime
from typing import List, Dict
import html
import logging
logger = logging.getLogger(__name__)


def escape_html(text: str) -> str:
    """Safely escape HTML in text content."""
    return html.escape(str(text)) if text else ""


def create_article_html(article: Dict[str, str]) -> str:
    """Generate HTML for a single article."""
    title = escape_html(article.get("title", "No Title"))
    link = escape_html(article.get("link", ""))
    ai_summary = escape_html(article.get("ai_summary", "No summary available"))

    return f"""
    <div class="article">
        <h2 class="article-title">
            <a href="{link}" target="_blank">{title}</a>
        </h2>
        <div class="article-summary">
            <p>{ai_summary}</p>
        </div>
        <div class="article-link">
            <a href="{link}" target="_blank" class="read-more">Read full article </a>
        </div>
    </div>
    """


def create_css_styles() -> str:
    """Generate CSS styles for the newsletter."""
    return """
    <style>
        body {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
            line-height: 1.6;
            color: #333;
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
            background-color: #f9f9f9;
        }
        .newsletter {
            background: white;
            border-radius: 8px;
            padding: 30px;
            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
        }
        .header {
            text-align: center;
            border-bottom: 2px solid #eee;
            padding-bottom: 20px;
            margin-bottom: 30px;
        }
        .header h1 {
            color: #2c3e50;
            margin: 0;
            font-size: 2.5em;
        }
        .header .date {
            color: #7f8c8d;
            font-size: 1.1em;
            margin-top: 10px;
        }
        .article {
            margin-bottom: 40px;
            padding-bottom: 30px;
            border-bottom: 1px solid #eee;
        }
        .article:last-child {
            border-bottom: none;
            margin-bottom: 0;
        }
        .article-title {
            margin: 0 0 15px 0;
            font-size: 1.4em;
        }
        .article-title a {
            color: #2980b9;
            text-decoration: none;
        }
        .article-title a:hover {
            color: #3498db;
            text-decoration: underline;
        }
        .article-summary {
            margin: 15px 0;
            background: #f8f9fa;
            padding: 15px;
            border-radius: 5px;
            border-left: 4px solid #3498db;
        }
        .article-summary p {
            margin: 0;
            color: #555;
        }
        .article-link {
            margin-top: 15px;
        }
        .read-more {
            color: #e74c3c;
            text-decoration: none;
            font-weight: 500;
        }
        .read-more:hover {
            text-decoration: underline;
        }
        .footer {
            margin-top: 40px;
            padding-top: 20px;
            border-top: 1px solid #eee;
            text-align: center;
            color: #7f8c8d;
            font-size: 0.9em;
        }
        .no-articles {
            text-align: center;
            color: #7f8c8d;
            font-style: italic;
            padding: 40px 20px;
        }
    </style>
    """


def create_header_html(feed_title: str = "RSS Newsletter") -> str:
    """Generate the newsletter header."""
    current_date = datetime.now().strftime("%B %d, %Y")
    return f"""
    <div class="header">
        <h1>📰 {escape_html(feed_title)}</h1>
        <div class="date">Daily Summary for {current_date}</div>
    </div>
    """


def create_footer_html() -> str:
    """Generate the newsletter footer."""
    return """
    <div class="footer">
        <p>Generated by rss2newsletter 🤖</p>
        <p>Powered by Ollama AI summaries</p>
    </div>
    """


def create_no_articles_html() -> str:
    """Generate HTML for when no articles are found."""
    return """
    <div class="no-articles">
        <h2>📭 No articles found for today</h2>
        <p>Check back tomorrow for fresh content!</p>
    </div>
    """


def generate_newsletter_html(
    articles: List[Dict[str, str]], feed_title: str = "RSS Newsletter"
) -> str:
    """Generate complete HTML newsletter from articles."""
    logger.info(f"Generating HTML newsletter with {len(articles)} articles")

    # Generate article HTML
    if articles:
        articles_html = "\n".join(create_article_html(article) for article in articles)
    else:
        articles_html = create_no_articles_html()

    # Combine all parts
    html_content = f"""
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{escape_html(feed_title)} - Daily Summary</title>
    {create_css_styles()}
</head>
<body>
    <div class="newsletter">
        {create_header_html(feed_title)}
        {articles_html}
        {create_footer_html()}
    </div>
</body>
</html>
    """
    return html_content


def save_newsletter_html(html_content: str, filename: str = "newsletter.html") -> str:
    """Save HTML content to file and return the filename."""
    try:
        with open(filename, "w", encoding="utf-8") as f:
            f.write(html_content)
        logger.info(f"Newsletter saved to: {filename}")
        return filename
    except IOError as e:
        logger.error(f"Error saving newsletter: {e}")
        raise
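
A small usage sketch for the generator (the article data below is made up for illustration):

sample_articles = [
    {
        "title": "Example post",
        "link": "https://example.com/post",
        "ai_summary": "A short AI-generated summary would appear here.",
    }
]

html = generate_newsletter_html(sample_articles, feed_title="My Feed")
save_newsletter_html(html, "preview.html")  # writes preview.html in the working directory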

View file

@@ -0,0 +1,112 @@
"""Ollama API client for generating summaries."""
import requests
import json
from typing import Dict, Optional
import logging
logger = logging.getLogger(__name__)


def create_ollama_client(base_url: str = "http://localhost:11434") -> Dict[str, str]:
    """Create an Ollama client configuration."""
    return {
        "base_url": base_url,
        "generate_endpoint": f"{base_url}/api/generate",
        "chat_endpoint": f"{base_url}/api/chat",
    }


def test_ollama_connection(client_config: Dict[str, str]) -> bool:
    """Test if Ollama server is accessible."""
    try:
        response = requests.get(f"{client_config['base_url']}/api/tags", timeout=5)
        return response.status_code == 200
    except requests.RequestException as e:
        logger.error(f"Failed to connect to Ollama server: {e}")
        return False


def generate_summary(
    client_config: Dict[str, str],
    content: str,
    model: str = "llama3.2",
    max_length: int = 150,
) -> Optional[str]:
    """Generate a summary of the given content using Ollama."""
    if not content.strip():
        return "No content available for summarization."

    prompt = f"""Please provide a concise summary of the following article in approximately {max_length} words. Focus on the key points and main ideas:

{content}

Summary:"""

    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {
            "temperature": 0.7,
            "top_p": 0.9,
            # Allow some buffer over the word target; Ollama's native option for
            # capping output tokens is num_predict.
            "num_predict": max_length * 2,
        },
    }

    try:
        logger.info(f"Generating summary using model: {model}")
        response = requests.post(
            client_config["generate_endpoint"], json=payload, timeout=60
        )
        response.raise_for_status()

        result = response.json()
        summary = result.get("response", "").strip()
        if not summary:
            logger.warning("Received empty summary from Ollama")
            return "Summary could not be generated."
        return summary
    except requests.RequestException as e:
        logger.error(f"Error communicating with Ollama: {e}")
        return f"Error generating summary: {str(e)}"
    except json.JSONDecodeError as e:
        logger.error(f"Error parsing Ollama response: {e}")
        return "Error: Invalid response from Ollama server."


def summarize_article(
    client_config: Dict[str, str], article: Dict[str, str], model: str = "llama3.2"
) -> Dict[str, str]:
    """Summarize a single article and return enriched article data."""
    # Use content if available, otherwise fall back to summary
    content_to_summarize = article.get("content") or article.get("summary", "")

    # Generate AI summary
    ai_summary = generate_summary(client_config, content_to_summarize, model)

    # Return enriched article data
    return {**article, "ai_summary": ai_summary or "Summary unavailable."}


def summarize_articles(
    client_config: Dict[str, str], articles: list, model: str = "llama3.2"
) -> list:
    """Summarize multiple articles using functional approach."""
    if not test_ollama_connection(client_config):
        logger.error("Cannot connect to Ollama server. Summaries will be unavailable.")
        # Return articles with placeholder summaries
        return [
            {
                **article,
                "ai_summary": "Summary unavailable - Ollama server not accessible.",
            }
            for article in articles
        ]

    logger.info(f"Summarizing {len(articles)} articles")
    return [summarize_article(client_config, article, model) for article in articles]
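
A short sketch of using the client helpers against a local Ollama instance (model name and article text are placeholders):

client = create_ollama_client("http://localhost:11434")

if test_ollama_connection(client):
    article = {
        "title": "Example",
        "link": "https://example.com/a",
        "summary": "Fallback summary text.",
        "content": "Full article text for the model to condense...",
    }
    enriched = summarize_article(client, article, model="llama3.2")
    print(enriched["ai_summary"])
else:
    print("Ollama is not reachable; summaries will fall back to placeholders.")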

View file

@@ -0,0 +1,93 @@
"""RSS feed fetching functionality."""
import feedparser
from datetime import datetime, timezone
from dateutil import parser as date_parser
from typing import List, Dict, Optional
import logging
logger = logging.getLogger(__name__)


def fetch_rss_feed(url: str) -> feedparser.FeedParserDict:
    """Fetch and parse RSS feed from URL."""
    try:
        logger.info(f"Fetching RSS feed from: {url}")
        feed = feedparser.parse(url)
        if feed.bozo:
            logger.warning(f"Feed parsing warning: {feed.bozo_exception}")
        return feed
    except Exception as e:
        logger.error(f"Error fetching RSS feed: {e}")
        raise


def is_today(entry_date: Optional[str]) -> bool:
    """Check if an entry was published today."""
    if not entry_date:
        return False
    try:
        # Parse the entry date
        parsed_date = date_parser.parse(entry_date)

        # Make sure it's timezone-aware
        if parsed_date.tzinfo is None:
            parsed_date = parsed_date.replace(tzinfo=timezone.utc)

        # Get today's date in UTC
        today = datetime.now(timezone.utc).date()
        return parsed_date.date() == today
    except Exception as e:
        logger.warning(f"Error parsing date '{entry_date}': {e}")
        return False


def extract_article_data(entry) -> Dict[str, str]:
    """Extract relevant data from a feed entry."""
    return {
        "title": getattr(entry, "title", "No Title"),
        "link": getattr(entry, "link", ""),
        "summary": getattr(entry, "summary", ""),
        "published": getattr(entry, "published", ""),
        "content": get_entry_content(entry),
    }


def get_entry_content(entry) -> str:
    """Extract the best available content from an entry."""
    # Try to get full content first
    if hasattr(entry, "content") and entry.content:
        return entry.content[0].value if entry.content else ""
    # Fall back to summary
    return getattr(entry, "summary", "")


def filter_todays_articles(feed: feedparser.FeedParserDict) -> List[Dict[str, str]]:
    """Filter articles published today from the RSS feed."""
    todays_articles = []
    for entry in feed.entries:
        # Check multiple possible date fields
        published_date = getattr(entry, "published", None)
        updated_date = getattr(entry, "updated", None)
        entry_date = published_date or updated_date

        if is_today(entry_date):
            article_data = extract_article_data(entry)
            todays_articles.append(article_data)
            logger.info(f"Found today's article: {article_data['title']}")

    logger.info(f"Found {len(todays_articles)} articles from today")
    return todays_articles


def get_todays_articles(rss_url: str) -> List[Dict[str, str]]:
    """Main function to fetch and filter today's articles from an RSS feed."""
    feed = fetch_rss_feed(rss_url)
    return filter_todays_articles(feed)
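
Finally, a minimal sketch of the fetcher in isolation (the feed URL is a placeholder):

articles = get_todays_articles("https://example.com/feed.xml")
for a in articles:
    print(a["published"], "-", a["title"])
# Each dict carries "title", "link", "summary", "published" and "content",
# ready to hand to summarize_articles().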