Initial untested implementation

Alex Selimov 2025-08-04 15:56:02 -04:00
parent 24200b899a
commit 193a50dd3b
8 changed files with 917 additions and 11 deletions

View file

@@ -1,2 +1,28 @@
def main() -> None:
print("Hello from rss2newsletter!")
"""
rss2newsletter - A simple tool for generating HTML newsletters from RSS feeds.
This package provides functionality to:
- Fetch articles from RSS feeds
- Filter articles published today
- Generate AI summaries using Ollama
- Create clean HTML newsletters
"""
__version__ = "0.1.0"
__author__ = "Your Name"
__email__ = "your.email@example.com"
from .rss_fetcher import get_todays_articles
from .ollama_client import create_ollama_client, summarize_articles
from .html_generator import generate_newsletter_html, save_newsletter_html
from .config import get_config_from_env, setup_logging
__all__ = [
"get_todays_articles",
"create_ollama_client",
"summarize_articles",
"generate_newsletter_html",
"save_newsletter_html",
"get_config_from_env",
"setup_logging",
]
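
For orientation, here is a minimal sketch of how these package-level exports are meant to compose end to end (the feed URL is a placeholder, and a local Ollama server is assumed to be running):

from rss2newsletter import (
    get_config_from_env,
    setup_logging,
    get_todays_articles,
    create_ollama_client,
    summarize_articles,
    generate_newsletter_html,
    save_newsletter_html,
)

config = get_config_from_env()
setup_logging(config)

articles = get_todays_articles("https://example.com/feed.xml")  # placeholder URL
client = create_ollama_client(config["ollama"]["base_url"])
summarized = summarize_articles(client, articles, config["ollama"]["model"])

html = generate_newsletter_html(summarized, config["output"]["feed_title"])
save_newsletter_html(html, config["output"]["filename"])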

View file

@@ -0,0 +1,161 @@
"""Main entry point for rss2newsletter."""

import sys
import argparse
import logging
from typing import Dict

from .config import get_config_from_env, setup_logging
from .rss_fetcher import get_todays_articles
from .ollama_client import create_ollama_client, summarize_articles
from .html_generator import generate_newsletter_html, save_newsletter_html

logger = logging.getLogger(__name__)


def create_argument_parser() -> argparse.ArgumentParser:
    """Create command line argument parser."""
    parser = argparse.ArgumentParser(
        description="Generate HTML newsletter from RSS feed using Ollama AI summaries",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python -m rss2newsletter https://feeds.example.com/rss
  python -m rss2newsletter https://blog.example.com/feed.xml --output my_newsletter.html
  python -m rss2newsletter https://news.example.com/rss --model llama3.1
        """,
    )
    parser.add_argument("rss_url", help="RSS feed URL to process")
    parser.add_argument(
        "--output",
        "-o",
        default=None,
        help="Output HTML filename (default: newsletter.html)",
    )
    parser.add_argument(
        "--model",
        "-m",
        default=None,
        help="Ollama model to use for summaries (default: llama3.2)",
    )
    parser.add_argument(
        "--ollama-url",
        default=None,
        help="Ollama server URL (default: http://localhost:11434)",
    )
    parser.add_argument(
        "--title", "-t", default=None, help="Newsletter title (default: RSS Newsletter)"
    )
    parser.add_argument(
        "--verbose", "-v", action="store_true", help="Enable verbose logging"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Fetch articles but don't generate summaries or save HTML",
    )
    return parser


def merge_config_with_args(config: Dict, args: argparse.Namespace) -> Dict:
    """Merge configuration with command line arguments."""
    if args.output:
        config["output"]["filename"] = args.output
    if args.model:
        config["ollama"]["model"] = args.model
    if args.ollama_url:
        config["ollama"]["base_url"] = args.ollama_url
    if args.title:
        config["output"]["feed_title"] = args.title
    if args.verbose:
        config["logging"]["level"] = "DEBUG"
    return config


def process_rss_to_newsletter(rss_url: str, config: Dict, dry_run: bool = False) -> str:
    """Main processing pipeline for RSS to newsletter conversion."""
    logger.info(f"Starting RSS newsletter generation for: {rss_url}")

    # Step 1: Fetch today's articles
    logger.info("Fetching articles from RSS feed...")
    articles = get_todays_articles(rss_url)
    if not articles:
        logger.warning("No articles found for today")
        return generate_newsletter_html([], config["output"]["feed_title"])
    logger.info(f"Found {len(articles)} articles from today")

    if dry_run:
        logger.info("Dry run mode - stopping before summarization")
        for i, article in enumerate(articles, 1):
            logger.info(f"Article {i}: {article['title']}")
        return ""

    # Step 2: Create Ollama client and summarize articles
    logger.info("Generating AI summaries...")
    ollama_client = create_ollama_client(config["ollama"]["base_url"])
    summarized_articles = summarize_articles(
        ollama_client, articles, config["ollama"]["model"]
    )

    # Step 3: Generate HTML newsletter
    logger.info("Generating HTML newsletter...")
    html_content = generate_newsletter_html(
        summarized_articles, config["output"]["feed_title"]
    )
    return html_content


def main() -> int:
    """Main entry point."""
    parser = create_argument_parser()
    args = parser.parse_args()

    # Load and merge configuration
    config = get_config_from_env()
    config = merge_config_with_args(config, args)

    # Setup logging
    setup_logging(config)

    try:
        # Process RSS feed to newsletter
        html_content = process_rss_to_newsletter(args.rss_url, config, args.dry_run)

        if not args.dry_run and html_content:
            # Save HTML file
            output_filename = save_newsletter_html(
                html_content, config["output"]["filename"]
            )
            print(f"Newsletter saved to: {output_filename}")

        return 0
    except KeyboardInterrupt:
        logger.info("Process interrupted by user")
        return 1
    except Exception as e:
        logger.error(f"Error generating newsletter: {e}")
        if config["logging"]["level"] == "DEBUG":
            logger.exception("Full error details:")
        return 1


if __name__ == "__main__":
    sys.exit(main())
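
To illustrate the flag-over-environment precedence implemented by merge_config_with_args, a small sketch (the Namespace is built by hand to mirror what the parser above would produce; it assumes the functions defined in this module are in scope):

import argparse

# Equivalent of: python -m rss2newsletter https://example.com/feed.xml -o tech.html -m llama3.1 -v
args = argparse.Namespace(
    rss_url="https://example.com/feed.xml",
    output="tech.html",
    model="llama3.1",
    ollama_url=None,
    title=None,
    verbose=True,
    dry_run=False,
)

config = merge_config_with_args(get_config_from_env(), args)
assert config["output"]["filename"] == "tech.html"
assert config["ollama"]["model"] == "llama3.1"
assert config["logging"]["level"] == "DEBUG"  # --verbose flips the log level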

View file

@@ -0,0 +1,75 @@
"""Configuration management for rss2newsletter."""
import os
from typing import Dict, Any
import logging
logger = logging.getLogger(__name__)


def get_default_config() -> Dict[str, Any]:
    """Get default configuration values."""
    return {
        "ollama": {
            "base_url": "http://localhost:11434",
            "model": "llama3.2",
            "timeout": 60,
            "max_summary_length": 150,
        },
        "output": {"filename": "newsletter.html", "feed_title": "RSS Newsletter"},
        "logging": {
            "level": "INFO",
            "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        },
    }


def get_config_from_env() -> Dict[str, Any]:
    """Get configuration from environment variables."""
    config = get_default_config()

    # Ollama configuration
    if os.getenv("OLLAMA_BASE_URL"):
        config["ollama"]["base_url"] = os.getenv("OLLAMA_BASE_URL")
    if os.getenv("OLLAMA_MODEL"):
        config["ollama"]["model"] = os.getenv("OLLAMA_MODEL")
    if os.getenv("OLLAMA_TIMEOUT"):
        try:
            config["ollama"]["timeout"] = int(os.getenv("OLLAMA_TIMEOUT"))
        except ValueError:
            logger.warning("Invalid OLLAMA_TIMEOUT value, using default")
    if os.getenv("MAX_SUMMARY_LENGTH"):
        try:
            config["ollama"]["max_summary_length"] = int(
                os.getenv("MAX_SUMMARY_LENGTH")
            )
        except ValueError:
            logger.warning("Invalid MAX_SUMMARY_LENGTH value, using default")

    # Output configuration
    if os.getenv("OUTPUT_FILENAME"):
        config["output"]["filename"] = os.getenv("OUTPUT_FILENAME")
    if os.getenv("FEED_TITLE"):
        config["output"]["feed_title"] = os.getenv("FEED_TITLE")

    # Logging configuration
    if os.getenv("LOG_LEVEL"):
        config["logging"]["level"] = os.getenv("LOG_LEVEL").upper()

    return config


def setup_logging(config: Dict[str, Any]) -> None:
    """Setup logging based on configuration."""
    log_level = getattr(logging, config["logging"]["level"], logging.INFO)
    log_format = config["logging"]["format"]
    logging.basicConfig(level=log_level, format=log_format, datefmt="%Y-%m-%d %H:%M:%S")

    # Reduce noise from external libraries
    logging.getLogger("urllib3").setLevel(logging.WARNING)
    logging.getLogger("requests").setLevel(logging.WARNING)

View file

@@ -0,0 +1,227 @@
"""HTML newsletter generation functionality."""
from datetime import datetime
from typing import List, Dict
import html
import logging
logger = logging.getLogger(__name__)


def escape_html(text: str) -> str:
    """Safely escape HTML in text content."""
    return html.escape(str(text)) if text else ""


def create_article_html(article: Dict[str, str]) -> str:
    """Generate HTML for a single article."""
    title = escape_html(article.get("title", "No Title"))
    link = escape_html(article.get("link", ""))
    ai_summary = escape_html(article.get("ai_summary", "No summary available"))

    return f"""
    <div class="article">
        <h2 class="article-title">
            <a href="{link}" target="_blank">{title}</a>
        </h2>
        <div class="article-summary">
            <p>{ai_summary}</p>
        </div>
        <div class="article-link">
            <a href="{link}" target="_blank" class="read-more">Read full article </a>
        </div>
    </div>
    """


def create_css_styles() -> str:
    """Generate CSS styles for the newsletter."""
    return """
    <style>
        body {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
            line-height: 1.6;
            color: #333;
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
            background-color: #f9f9f9;
        }
        .newsletter {
            background: white;
            border-radius: 8px;
            padding: 30px;
            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
        }
        .header {
            text-align: center;
            border-bottom: 2px solid #eee;
            padding-bottom: 20px;
            margin-bottom: 30px;
        }
        .header h1 {
            color: #2c3e50;
            margin: 0;
            font-size: 2.5em;
        }
        .header .date {
            color: #7f8c8d;
            font-size: 1.1em;
            margin-top: 10px;
        }
        .article {
            margin-bottom: 40px;
            padding-bottom: 30px;
            border-bottom: 1px solid #eee;
        }
        .article:last-child {
            border-bottom: none;
            margin-bottom: 0;
        }
        .article-title {
            margin: 0 0 15px 0;
            font-size: 1.4em;
        }
        .article-title a {
            color: #2980b9;
            text-decoration: none;
        }
        .article-title a:hover {
            color: #3498db;
            text-decoration: underline;
        }
        .article-summary {
            margin: 15px 0;
            background: #f8f9fa;
            padding: 15px;
            border-radius: 5px;
            border-left: 4px solid #3498db;
        }
        .article-summary p {
            margin: 0;
            color: #555;
        }
        .article-link {
            margin-top: 15px;
        }
        .read-more {
            color: #e74c3c;
            text-decoration: none;
            font-weight: 500;
        }
        .read-more:hover {
            text-decoration: underline;
        }
        .footer {
            margin-top: 40px;
            padding-top: 20px;
            border-top: 1px solid #eee;
            text-align: center;
            color: #7f8c8d;
            font-size: 0.9em;
        }
        .no-articles {
            text-align: center;
            color: #7f8c8d;
            font-style: italic;
            padding: 40px 20px;
        }
    </style>
    """


def create_header_html(feed_title: str = "RSS Newsletter") -> str:
    """Generate the newsletter header."""
    current_date = datetime.now().strftime("%B %d, %Y")
    return f"""
    <div class="header">
        <h1>📰 {escape_html(feed_title)}</h1>
        <div class="date">Daily Summary for {current_date}</div>
    </div>
    """


def create_footer_html() -> str:
    """Generate the newsletter footer."""
    return """
    <div class="footer">
        <p>Generated by rss2newsletter 🤖</p>
        <p>Powered by Ollama AI summaries</p>
    </div>
    """


def create_no_articles_html() -> str:
    """Generate HTML for when no articles are found."""
    return """
    <div class="no-articles">
        <h2>📭 No articles found for today</h2>
        <p>Check back tomorrow for fresh content!</p>
    </div>
    """


def generate_newsletter_html(
    articles: List[Dict[str, str]], feed_title: str = "RSS Newsletter"
) -> str:
    """Generate complete HTML newsletter from articles."""
    logger.info(f"Generating HTML newsletter with {len(articles)} articles")

    # Generate article HTML
    if articles:
        articles_html = "\n".join(create_article_html(article) for article in articles)
    else:
        articles_html = create_no_articles_html()

    # Combine all parts
    html_content = f"""
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{escape_html(feed_title)} - Daily Summary</title>
    {create_css_styles()}
</head>
<body>
    <div class="newsletter">
        {create_header_html(feed_title)}
        {articles_html}
        {create_footer_html()}
    </div>
</body>
</html>
    """
    return html_content


def save_newsletter_html(html_content: str, filename: str = "newsletter.html") -> str:
    """Save HTML content to file and return the filename."""
    try:
        with open(filename, "w", encoding="utf-8") as f:
            f.write(html_content)
        logger.info(f"Newsletter saved to: {filename}")
        return filename
    except IOError as e:
        logger.error(f"Error saving newsletter: {e}")
        raise
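
A small usage sketch for the generator (the article data below is made up for illustration):

sample_articles = [
    {
        "title": "Example post",
        "link": "https://example.com/post",
        "ai_summary": "A short AI-generated summary would appear here.",
    }
]

html = generate_newsletter_html(sample_articles, feed_title="My Feed")
save_newsletter_html(html, "preview.html")  # writes preview.html in the working directory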

View file

@@ -0,0 +1,112 @@
"""Ollama API client for generating summaries."""
import requests
import json
from typing import Dict, Optional
import logging
logger = logging.getLogger(__name__)


def create_ollama_client(base_url: str = "http://localhost:11434") -> Dict[str, str]:
    """Create an Ollama client configuration."""
    return {
        "base_url": base_url,
        "generate_endpoint": f"{base_url}/api/generate",
        "chat_endpoint": f"{base_url}/api/chat",
    }


def test_ollama_connection(client_config: Dict[str, str]) -> bool:
    """Test if Ollama server is accessible."""
    try:
        response = requests.get(f"{client_config['base_url']}/api/tags", timeout=5)
        return response.status_code == 200
    except requests.RequestException as e:
        logger.error(f"Failed to connect to Ollama server: {e}")
        return False


def generate_summary(
    client_config: Dict[str, str],
    content: str,
    model: str = "llama3.2",
    max_length: int = 150,
) -> Optional[str]:
    """Generate a summary of the given content using Ollama."""
    if not content.strip():
        return "No content available for summarization."

    prompt = f"""Please provide a concise summary of the following article in approximately {max_length} words. Focus on the key points and main ideas:

{content}

Summary:"""

    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {
            "temperature": 0.7,
            "top_p": 0.9,
            # Allow some buffer over the word target; Ollama's native option for
            # capping output tokens is num_predict.
            "num_predict": max_length * 2,
        },
    }

    try:
        logger.info(f"Generating summary using model: {model}")
        response = requests.post(
            client_config["generate_endpoint"], json=payload, timeout=60
        )
        response.raise_for_status()

        result = response.json()
        summary = result.get("response", "").strip()
        if not summary:
            logger.warning("Received empty summary from Ollama")
            return "Summary could not be generated."
        return summary
    except requests.RequestException as e:
        logger.error(f"Error communicating with Ollama: {e}")
        return f"Error generating summary: {str(e)}"
    except json.JSONDecodeError as e:
        logger.error(f"Error parsing Ollama response: {e}")
        return "Error: Invalid response from Ollama server."


def summarize_article(
    client_config: Dict[str, str], article: Dict[str, str], model: str = "llama3.2"
) -> Dict[str, str]:
    """Summarize a single article and return enriched article data."""
    # Use content if available, otherwise fall back to summary
    content_to_summarize = article.get("content") or article.get("summary", "")

    # Generate AI summary
    ai_summary = generate_summary(client_config, content_to_summarize, model)

    # Return enriched article data
    return {**article, "ai_summary": ai_summary or "Summary unavailable."}


def summarize_articles(
    client_config: Dict[str, str], articles: list, model: str = "llama3.2"
) -> list:
    """Summarize multiple articles using functional approach."""
    if not test_ollama_connection(client_config):
        logger.error("Cannot connect to Ollama server. Summaries will be unavailable.")
        # Return articles with placeholder summaries
        return [
            {
                **article,
                "ai_summary": "Summary unavailable - Ollama server not accessible.",
            }
            for article in articles
        ]

    logger.info(f"Summarizing {len(articles)} articles")
    return [summarize_article(client_config, article, model) for article in articles]
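
A short sketch of using the client helpers against a local Ollama instance (model name and article text are placeholders):

client = create_ollama_client("http://localhost:11434")

if test_ollama_connection(client):
    article = {
        "title": "Example",
        "link": "https://example.com/a",
        "summary": "Fallback summary text.",
        "content": "Full article text for the model to condense...",
    }
    enriched = summarize_article(client, article, model="llama3.2")
    print(enriched["ai_summary"])
else:
    print("Ollama is not reachable; summaries will fall back to placeholders.")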

View file

@@ -0,0 +1,93 @@
"""RSS feed fetching functionality."""
import feedparser
from datetime import datetime, timezone
from dateutil import parser as date_parser
from typing import List, Dict, Optional
import logging
logger = logging.getLogger(__name__)


def fetch_rss_feed(url: str) -> feedparser.FeedParserDict:
    """Fetch and parse RSS feed from URL."""
    try:
        logger.info(f"Fetching RSS feed from: {url}")
        feed = feedparser.parse(url)
        if feed.bozo:
            logger.warning(f"Feed parsing warning: {feed.bozo_exception}")
        return feed
    except Exception as e:
        logger.error(f"Error fetching RSS feed: {e}")
        raise


def is_today(entry_date: Optional[str]) -> bool:
    """Check if an entry was published today."""
    if not entry_date:
        return False
    try:
        # Parse the entry date
        parsed_date = date_parser.parse(entry_date)

        # Make sure it's timezone-aware
        if parsed_date.tzinfo is None:
            parsed_date = parsed_date.replace(tzinfo=timezone.utc)

        # Get today's date in UTC
        today = datetime.now(timezone.utc).date()
        return parsed_date.date() == today
    except Exception as e:
        logger.warning(f"Error parsing date '{entry_date}': {e}")
        return False


def extract_article_data(entry) -> Dict[str, str]:
    """Extract relevant data from a feed entry."""
    return {
        "title": getattr(entry, "title", "No Title"),
        "link": getattr(entry, "link", ""),
        "summary": getattr(entry, "summary", ""),
        "published": getattr(entry, "published", ""),
        "content": get_entry_content(entry),
    }


def get_entry_content(entry) -> str:
    """Extract the best available content from an entry."""
    # Try to get full content first
    if hasattr(entry, "content") and entry.content:
        return entry.content[0].value if entry.content else ""
    # Fall back to summary
    return getattr(entry, "summary", "")


def filter_todays_articles(feed: feedparser.FeedParserDict) -> List[Dict[str, str]]:
    """Filter articles published today from the RSS feed."""
    todays_articles = []
    for entry in feed.entries:
        # Check multiple possible date fields
        published_date = getattr(entry, "published", None)
        updated_date = getattr(entry, "updated", None)
        entry_date = published_date or updated_date

        if is_today(entry_date):
            article_data = extract_article_data(entry)
            todays_articles.append(article_data)
            logger.info(f"Found today's article: {article_data['title']}")

    logger.info(f"Found {len(todays_articles)} articles from today")
    return todays_articles


def get_todays_articles(rss_url: str) -> List[Dict[str, str]]:
    """Main function to fetch and filter today's articles from an RSS feed."""
    feed = fetch_rss_feed(rss_url)
    return filter_todays_articles(feed)
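
Finally, a minimal sketch of the fetcher in isolation (the feed URL is a placeholder):

articles = get_todays_articles("https://example.com/feed.xml")
for a in articles:
    print(a["published"], "-", a["title"])
# Each dict carries "title", "link", "summary", "published" and "content",
# ready to hand to summarize_articles().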