Initial version

This commit is contained in:
Alex Selimov 2026-02-12 23:55:07 -05:00
commit 42996b0f4e
31 changed files with 933 additions and 0 deletions

View file

@@ -0,0 +1,3 @@
"""APA citation helper CLI."""
__all__ = ["cli", "fetchers", "formatter", "identifiers", "models"]

48
build/lib/citer/cli.py Normal file
View file

@@ -0,0 +1,48 @@
import argparse
import sys
from .exceptions import CitationError
from .fetchers import fetch_arxiv, fetch_doi
from .formatter import format_apa
from .identifiers import normalize_arxiv_identifier, normalize_doi
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI parser with one required subcommand per citation source."""
    root = argparse.ArgumentParser(
        description="Create an APA style citation from an arXiv ID or DOI."
    )
    commands = root.add_subparsers(dest="source", required=True)
    # Both subcommands share the same shape: a single positional identifier.
    for command, command_help, arg_help in (
        ("arxiv", "Cite an arXiv identifier or URL", "arXiv ID or URL"),
        ("doi", "Cite a DOI or DOI URL", "DOI or DOI URL"),
    ):
        sub = commands.add_parser(command, help=command_help)
        sub.add_argument("identifier", help=arg_help)
    return root
def main(argv=None) -> int:
    """CLI entry point: parse args, fetch metadata, print the citation.

    Returns 0 on success, 1 when the citation could not be produced.
    """
    parser = build_parser()
    args = parser.parse_args(argv)
    try:
        if args.source == "doi":
            metadata = fetch_doi(normalize_doi(args.identifier))
        elif args.source == "arxiv":
            metadata = fetch_arxiv(normalize_arxiv_identifier(args.identifier))
        else:
            # Defensive: unreachable while the subparsers are required.
            parser.error("Unsupported source")
            return 1
    except CitationError as exc:
        print(f"Error: {exc}", file=sys.stderr)
        return 1
    print(format_apa(metadata))
    return 0
if __name__ == "__main__":
    # Propagate main()'s integer result as the process exit code.
    raise SystemExit(main())

View file

@@ -0,0 +1,2 @@
class CitationError(Exception):
    """Raised when a citation cannot be created from the provided input.

    Shared error type for identifier normalization and metadata fetch
    failures, so the CLI can report all of them with one except clause.
    """

158
build/lib/citer/fetchers.py Normal file
View file

@@ -0,0 +1,158 @@
import datetime as _dt
from typing import List, Optional
from urllib.parse import quote
from xml.etree import ElementTree
import requests
from .exceptions import CitationError
from .models import Author, WorkMetadata
# arXiv Atom API endpoint; queried with an id_list parameter.
ARXIV_API = "http://export.arxiv.org/api/query"
# Crossref REST works endpoint; the URL-encoded DOI is appended.
CROSSREF_WORKS = "https://api.crossref.org/works/"
# Identifying User-Agent sent with every request, per API etiquette.
USER_AGENT = "citer-cli/0.1 (mailto:unknown@example.com)"
def fetch_arxiv(arxiv_id: str) -> WorkMetadata:
    """Fetch metadata for *arxiv_id* from the arXiv Atom API.

    Raises:
        CitationError: on network failure, invalid XML, or a missing entry.
    """
    try:
        resp = requests.get(
            ARXIV_API,
            params={"id_list": arxiv_id},
            headers={"User-Agent": USER_AGENT},
            timeout=10,
        )
        resp.raise_for_status()
    except requests.RequestException as exc:
        raise CitationError(f"Failed to reach arXiv API: {exc}") from exc

    try:
        root = ElementTree.fromstring(resp.text)
    except ElementTree.ParseError as exc:
        raise CitationError("Received invalid XML from arXiv") from exc

    ns = {
        "atom": "http://www.w3.org/2005/Atom",
        "arxiv": "http://arxiv.org/schemas/atom",
    }
    entry = root.find("atom:entry", ns)
    if entry is None:
        raise CitationError(f"No entry found for arXiv ID {arxiv_id}")

    # An explicit DOI is only present when the preprint has been published.
    doi_text = _clean(entry.findtext("arxiv:doi", default="", namespaces=ns))
    return WorkMetadata(
        title=_clean(entry.findtext("atom:title", default="", namespaces=ns)),
        authors=_parse_arxiv_authors(entry.findall("atom:author", ns)),
        year=_parse_year(entry.findtext("atom:published", default="", namespaces=ns)),
        container_title="arXiv preprint",
        doi=doi_text or None,
        url=f"https://arxiv.org/abs/{arxiv_id}",
        source="arxiv",
        identifier=arxiv_id,
    )
def fetch_doi(doi: str) -> WorkMetadata:
    """Fetch metadata for *doi* from the Crossref works API.

    Raises:
        CitationError: on network failure or an invalid JSON payload.
    """
    endpoint = CROSSREF_WORKS + quote(doi)
    headers = {"Accept": "application/json", "User-Agent": USER_AGENT}
    try:
        resp = requests.get(endpoint, headers=headers, timeout=10)
        resp.raise_for_status()
        payload = resp.json()
    except requests.RequestException as exc:
        raise CitationError(f"Failed to reach Crossref for DOI {doi}: {exc}") from exc
    except ValueError as exc:
        raise CitationError("Received invalid JSON from Crossref") from exc

    message = payload.get("message", {})

    def field(key: str) -> Optional[str]:
        # Crossref string fields may be missing; collapse whitespace and
        # map empty results to None.
        return _clean(message.get(key, "")).strip() or None

    # Title and container-title arrive as lists of strings.
    title = _clean(" ".join(message.get("title", [])).strip())
    container_title = _clean((message.get("container-title") or [""])[0])
    return WorkMetadata(
        title=title,
        authors=_parse_crossref_authors(message.get("author", [])),
        year=_extract_year(message),
        container_title=container_title or None,
        volume=field("volume"),
        issue=field("issue"),
        pages=field("page"),
        doi=doi,
        url=message.get("URL") or f"https://doi.org/{doi}",
        source="doi",
        identifier=doi,
    )
def _clean(value: str) -> str:
return " ".join(value.split())
def _parse_arxiv_authors(author_elements) -> List[Author]:
    """Convert arXiv Atom <author> elements into Author records."""
    # Fully-qualified tag, since these elements live in the Atom namespace.
    atom_name = "{http://www.w3.org/2005/Atom}name"
    results: List[Author] = []
    for node in author_elements:
        given, family = _split_author_name(node.findtext(atom_name, default=""))
        results.append(Author(given=given, family=family))
    return results
def _parse_crossref_authors(author_data) -> List[Author]:
    """Convert Crossref author dicts into Author records.

    Structured given/family fields are preferred; a bare "name" field is
    split heuristically; entries with neither are skipped.
    """
    results: List[Author] = []
    for entry in author_data:
        if "family" in entry or "given" in entry:
            given = entry.get("given", "").strip()
            family = entry.get("family", "").strip()
        elif "name" in entry:
            given, family = _split_author_name(entry.get("name", ""))
        else:
            continue
        results.append(Author(given=given, family=family))
    return results
def _split_author_name(name: str) -> tuple[str, str]:
    """Split a free-form name into a (given, family) pair.

    "Family, Given" order is honored when a comma is present; otherwise
    the last whitespace-separated token is taken as the family name.
    """
    normalized = _clean(name)
    if not normalized:
        return "", ""
    if "," in normalized:
        family_part, given_part = normalized.split(",", 1)
        return given_part.strip(), family_part.strip()
    tokens = normalized.split()
    return " ".join(tokens[:-1]), tokens[-1]
def _parse_year(value: str) -> Optional[int]:
value = value.strip()
if not value:
return None
try:
return _dt.datetime.fromisoformat(value).year
except ValueError:
if len(value) >= 4 and value[:4].isdigit():
return int(value[:4])
return None
def _extract_year(message: dict) -> int | None:
for key in ("published-print", "published-online", "issued"):
data = message.get(key, {})
parts = data.get("date-parts") if isinstance(data, dict) else None
if parts and len(parts) and len(parts[0]):
maybe_year = parts[0][0]
if isinstance(maybe_year, int):
return maybe_year
return None

View file

@@ -0,0 +1,90 @@
from typing import Iterable
from .models import Author, WorkMetadata
def format_apa(metadata: WorkMetadata) -> str:
    """Render *metadata* as a single APA-style citation line."""
    segments = [
        format_authors(metadata.authors),
        f"({metadata.year})." if metadata.year else "(n.d.).",
        _sentence_case(metadata.title).rstrip(".") + ".",
    ]
    container = _format_container(metadata)
    if container:
        segments.append(container)
    # Prefer a DOI link; fall back to any plain URL.
    if metadata.doi:
        segments.append(f"https://doi.org/{metadata.doi}")
    elif metadata.url:
        segments.append(metadata.url)
    return " ".join(segment.strip() for segment in segments if segment).strip()
def format_authors(authors: Iterable[Author]) -> str:
    """Join formatted author names in APA style.

    Two authors are joined with ", & "; three or more are comma-separated
    with ", & " before the last. Authors that format to an empty string
    are dropped; an empty result yields "".
    """
    # Format each author exactly once (the original called format_author
    # twice per author: once to filter, once to keep the value).
    formatted = [name for name in (format_author(a) for a in authors) if name]
    if not formatted:
        return ""
    if len(formatted) == 1:
        return formatted[0]
    if len(formatted) == 2:
        return f"{formatted[0]}, & {formatted[1]}"
    return ", ".join(formatted[:-1]) + f", & {formatted[-1]}"
def format_author(author: Author) -> str:
    """Format one author as "Family, G. I." in APA style.

    Falls back to whichever of family name or initials is non-empty.
    """
    family = author.family.strip()
    initials = " ".join(_initial(token) for token in author.given.split() if token)
    if family and initials:
        return f"{family}, {initials}"
    return family or initials
def _initial(part: str) -> str:
clean = part.strip()
if not clean:
return ""
return f"{clean[0].upper()}."
def _sentence_case(text: str) -> str:
stripped = text.strip()
if not stripped:
return stripped
lower = stripped[0].upper() + stripped[1:]
return lower
def _format_container(metadata: WorkMetadata) -> str:
if not metadata.container_title and not metadata.volume and not metadata.pages:
return ""
pieces = []
if metadata.container_title:
pieces.append(metadata.container_title)
volume_issue = ""
if metadata.volume:
volume_issue = metadata.volume
if metadata.issue:
volume_issue += f"({metadata.issue})"
elif metadata.issue:
volume_issue = f"({metadata.issue})"
if volume_issue:
pieces.append(volume_issue)
if metadata.source == "arxiv" and metadata.identifier:
pieces.append(f"arXiv:{metadata.identifier}")
if metadata.pages:
pieces.append(metadata.pages)
container = ", ".join(pieces)
if container and not container.endswith("."):
container += "."
return container

View file

@@ -0,0 +1,50 @@
import re
from urllib.parse import unquote
from .exceptions import CitationError
# Matches new-style IDs ("2101.00001") and old-style "archive/NNNNNNN" IDs;
# an optional version suffix ("v2") is allowed but excluded from the
# captured "id" group.
ARXIV_ID = re.compile(
    r"^(?P<id>(?:\d{4}\.\d{4,5}|[a-z-]+\/\d{7}))(?:v\d+)?$",
    flags=re.IGNORECASE,
)
# DOIs: "10." + 4-9 digit registrant code + "/" + non-space suffix.
DOI_PATTERN = re.compile(r"10\.\d{4,9}/\S+", flags=re.IGNORECASE)
def normalize_arxiv_identifier(raw: str) -> str:
    """Reduce an arXiv URL, "arXiv:" prefix, or bare ID to a canonical ID.

    Version suffixes (e.g. "v2") are stripped.

    Raises:
        CitationError: when no arXiv identifier can be recognized.
    """
    candidate = unquote(raw.strip())
    # Extract the ID segment from an abs/pdf URL when one is given.
    from_url = re.search(
        r"arxiv\.org/(?:abs|pdf)/(?P<id>[^?#/]+)", candidate, re.IGNORECASE
    )
    if from_url:
        candidate = from_url.group("id")
    candidate = re.sub(r"(?i)^arxiv:", "", candidate)
    candidate = re.sub(r"(?i)\.pdf$", "", candidate)
    candidate = candidate.split("?")[0].strip()
    matched = ARXIV_ID.match(candidate)
    if not matched:
        raise CitationError(f"Could not understand arXiv identifier: {raw}")
    # The pattern already keeps any version suffix out of the "id" group;
    # this extra strip is defensive.
    return re.sub(r"(?i)v\d+$", "", matched.group("id"))
def normalize_doi(raw: str) -> str:
    """Reduce a DOI URL or "doi:"-prefixed string to a lowercase bare DOI.

    Raises:
        CitationError: when no DOI pattern is found in the input.
    """
    candidate = unquote(raw.strip())
    candidate = re.sub(r"(?i)^https?://(dx\.)?doi\.org/", "", candidate)
    candidate = re.sub(r"(?i)^doi:\s*", "", candidate)
    candidate = candidate.split("?")[0].strip()
    found = DOI_PATTERN.search(candidate)
    if found is None:
        raise CitationError(f"Could not understand DOI: {raw}")
    # DOIs are case-insensitive; lowercase gives a stable canonical form.
    return found.group(0).lower()

27
build/lib/citer/models.py Normal file
View file

@@ -0,0 +1,27 @@
from dataclasses import dataclass
from typing import List, Optional
@dataclass
class Author:
    """A single author with separate given and family name parts."""

    given: str
    family: str

    @property
    def full_name(self) -> str:
        """Display name: given then family, trimmed when either part is empty."""
        combined = f"{self.given} {self.family}"
        return combined.strip()
@dataclass
class WorkMetadata:
    """Normalized bibliographic metadata for a cited work.

    Populated by the arXiv and Crossref fetchers and consumed by the
    APA formatter.
    """

    title: str
    authors: List[Author]
    year: Optional[int]  # publication year; None when unknown
    container_title: Optional[str] = None  # journal name, or "arXiv preprint"
    volume: Optional[str] = None
    issue: Optional[str] = None
    pages: Optional[str] = None
    doi: Optional[str] = None
    url: Optional[str] = None
    source: str = ""  # which fetcher produced this: "arxiv" or "doi"
    identifier: Optional[str] = None  # normalized arXiv ID or DOI