Initial version

This commit is contained in:
Alex Selimov 2026-02-12 23:55:07 -05:00
commit 42996b0f4e
31 changed files with 933 additions and 0 deletions

View file

@@ -0,0 +1,3 @@
"""APA citation helper CLI."""
__all__ = ["cli", "fetchers", "formatter", "identifiers", "models"]

48
build/lib/citer/cli.py Normal file
View file

@@ -0,0 +1,48 @@
import argparse
import sys
from .exceptions import CitationError
from .fetchers import fetch_arxiv, fetch_doi
from .formatter import format_apa
from .identifiers import normalize_arxiv_identifier, normalize_doi
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI parser with one required subcommand per citation source."""
    root = argparse.ArgumentParser(
        description="Create an APA style citation from an arXiv ID or DOI."
    )
    commands = root.add_subparsers(dest="source", required=True)
    # Both subcommands share the same shape: a single positional identifier.
    for command, command_help, arg_help in (
        ("arxiv", "Cite an arXiv identifier or URL", "arXiv ID or URL"),
        ("doi", "Cite a DOI or DOI URL", "DOI or DOI URL"),
    ):
        sub = commands.add_parser(command, help=command_help)
        sub.add_argument("identifier", help=arg_help)
    return root
def main(argv=None) -> int:
    """CLI entry point: parse args, fetch metadata, print the citation.

    Returns 0 on success, 1 when the citation could not be produced.
    """
    parser = build_parser()
    args = parser.parse_args(argv)
    try:
        if args.source == "doi":
            metadata = fetch_doi(normalize_doi(args.identifier))
        elif args.source == "arxiv":
            metadata = fetch_arxiv(normalize_arxiv_identifier(args.identifier))
        else:
            # Defensive: unreachable while the subparsers are required.
            parser.error("Unsupported source")
            return 1
    except CitationError as exc:
        print(f"Error: {exc}", file=sys.stderr)
        return 1
    print(format_apa(metadata))
    return 0
if __name__ == "__main__":
    # Propagate main()'s integer result as the process exit code.
    raise SystemExit(main())

View file

@@ -0,0 +1,2 @@
class CitationError(Exception):
    """Raised when a citation cannot be created from the provided input.

    Shared error type for identifier normalization and metadata fetch
    failures, so the CLI can report all of them with one except clause.
    """

158
build/lib/citer/fetchers.py Normal file
View file

@@ -0,0 +1,158 @@
import datetime as _dt
from typing import List, Optional
from urllib.parse import quote
from xml.etree import ElementTree
import requests
from .exceptions import CitationError
from .models import Author, WorkMetadata
# arXiv Atom API endpoint; queried with an id_list parameter.
ARXIV_API = "http://export.arxiv.org/api/query"
# Crossref REST works endpoint; the URL-encoded DOI is appended.
CROSSREF_WORKS = "https://api.crossref.org/works/"
# Identifying User-Agent sent with every request, per API etiquette.
USER_AGENT = "citer-cli/0.1 (mailto:unknown@example.com)"
def fetch_arxiv(arxiv_id: str) -> WorkMetadata:
    """Fetch metadata for *arxiv_id* from the arXiv Atom API.

    Raises:
        CitationError: on network failure, invalid XML, or a missing entry.
    """
    try:
        resp = requests.get(
            ARXIV_API,
            params={"id_list": arxiv_id},
            headers={"User-Agent": USER_AGENT},
            timeout=10,
        )
        resp.raise_for_status()
    except requests.RequestException as exc:
        raise CitationError(f"Failed to reach arXiv API: {exc}") from exc

    try:
        root = ElementTree.fromstring(resp.text)
    except ElementTree.ParseError as exc:
        raise CitationError("Received invalid XML from arXiv") from exc

    ns = {
        "atom": "http://www.w3.org/2005/Atom",
        "arxiv": "http://arxiv.org/schemas/atom",
    }
    entry = root.find("atom:entry", ns)
    if entry is None:
        raise CitationError(f"No entry found for arXiv ID {arxiv_id}")

    # An explicit DOI is only present when the preprint has been published.
    doi_text = _clean(entry.findtext("arxiv:doi", default="", namespaces=ns))
    return WorkMetadata(
        title=_clean(entry.findtext("atom:title", default="", namespaces=ns)),
        authors=_parse_arxiv_authors(entry.findall("atom:author", ns)),
        year=_parse_year(entry.findtext("atom:published", default="", namespaces=ns)),
        container_title="arXiv preprint",
        doi=doi_text or None,
        url=f"https://arxiv.org/abs/{arxiv_id}",
        source="arxiv",
        identifier=arxiv_id,
    )
def fetch_doi(doi: str) -> WorkMetadata:
    """Fetch metadata for *doi* from the Crossref works API.

    Raises:
        CitationError: on network failure or an invalid JSON payload.
    """
    endpoint = CROSSREF_WORKS + quote(doi)
    headers = {"Accept": "application/json", "User-Agent": USER_AGENT}
    try:
        resp = requests.get(endpoint, headers=headers, timeout=10)
        resp.raise_for_status()
        payload = resp.json()
    except requests.RequestException as exc:
        raise CitationError(f"Failed to reach Crossref for DOI {doi}: {exc}") from exc
    except ValueError as exc:
        raise CitationError("Received invalid JSON from Crossref") from exc

    message = payload.get("message", {})

    def field(key: str) -> Optional[str]:
        # Crossref string fields may be missing; collapse whitespace and
        # map empty results to None.
        return _clean(message.get(key, "")).strip() or None

    # Title and container-title arrive as lists of strings.
    title = _clean(" ".join(message.get("title", [])).strip())
    container_title = _clean((message.get("container-title") or [""])[0])
    return WorkMetadata(
        title=title,
        authors=_parse_crossref_authors(message.get("author", [])),
        year=_extract_year(message),
        container_title=container_title or None,
        volume=field("volume"),
        issue=field("issue"),
        pages=field("page"),
        doi=doi,
        url=message.get("URL") or f"https://doi.org/{doi}",
        source="doi",
        identifier=doi,
    )
def _clean(value: str) -> str:
return " ".join(value.split())
def _parse_arxiv_authors(author_elements) -> List[Author]:
    """Convert arXiv Atom <author> elements into Author records."""
    # Fully-qualified tag, since these elements live in the Atom namespace.
    atom_name = "{http://www.w3.org/2005/Atom}name"
    results: List[Author] = []
    for node in author_elements:
        given, family = _split_author_name(node.findtext(atom_name, default=""))
        results.append(Author(given=given, family=family))
    return results
def _parse_crossref_authors(author_data) -> List[Author]:
    """Convert Crossref author dicts into Author records.

    Structured given/family fields are preferred; a bare "name" field is
    split heuristically; entries with neither are skipped.
    """
    results: List[Author] = []
    for entry in author_data:
        if "family" in entry or "given" in entry:
            given = entry.get("given", "").strip()
            family = entry.get("family", "").strip()
        elif "name" in entry:
            given, family = _split_author_name(entry.get("name", ""))
        else:
            continue
        results.append(Author(given=given, family=family))
    return results
def _split_author_name(name: str) -> tuple[str, str]:
    """Split a free-form name into a (given, family) pair.

    "Family, Given" order is honored when a comma is present; otherwise
    the last whitespace-separated token is taken as the family name.
    """
    normalized = _clean(name)
    if not normalized:
        return "", ""
    if "," in normalized:
        family_part, given_part = normalized.split(",", 1)
        return given_part.strip(), family_part.strip()
    tokens = normalized.split()
    return " ".join(tokens[:-1]), tokens[-1]
def _parse_year(value: str) -> Optional[int]:
value = value.strip()
if not value:
return None
try:
return _dt.datetime.fromisoformat(value).year
except ValueError:
if len(value) >= 4 and value[:4].isdigit():
return int(value[:4])
return None
def _extract_year(message: dict) -> int | None:
for key in ("published-print", "published-online", "issued"):
data = message.get(key, {})
parts = data.get("date-parts") if isinstance(data, dict) else None
if parts and len(parts) and len(parts[0]):
maybe_year = parts[0][0]
if isinstance(maybe_year, int):
return maybe_year
return None

View file

@@ -0,0 +1,90 @@
from typing import Iterable
from .models import Author, WorkMetadata
def format_apa(metadata: WorkMetadata) -> str:
    """Render *metadata* as a single APA-style citation line."""
    segments = [
        format_authors(metadata.authors),
        f"({metadata.year})." if metadata.year else "(n.d.).",
        _sentence_case(metadata.title).rstrip(".") + ".",
    ]
    container = _format_container(metadata)
    if container:
        segments.append(container)
    # Prefer a DOI link; fall back to any plain URL.
    if metadata.doi:
        segments.append(f"https://doi.org/{metadata.doi}")
    elif metadata.url:
        segments.append(metadata.url)
    return " ".join(segment.strip() for segment in segments if segment).strip()
def format_authors(authors: Iterable[Author]) -> str:
    """Join formatted author names in APA style.

    Two authors are joined with ", & "; three or more are comma-separated
    with ", & " before the last. Authors that format to an empty string
    are dropped; an empty result yields "".
    """
    # Format each author exactly once (the original called format_author
    # twice per author: once to filter, once to keep the value).
    formatted = [name for name in (format_author(a) for a in authors) if name]
    if not formatted:
        return ""
    if len(formatted) == 1:
        return formatted[0]
    if len(formatted) == 2:
        return f"{formatted[0]}, & {formatted[1]}"
    return ", ".join(formatted[:-1]) + f", & {formatted[-1]}"
def format_author(author: Author) -> str:
    """Format one author as "Family, G. I." in APA style.

    Falls back to whichever of family name or initials is non-empty.
    """
    family = author.family.strip()
    initials = " ".join(_initial(token) for token in author.given.split() if token)
    if family and initials:
        return f"{family}, {initials}"
    return family or initials
def _initial(part: str) -> str:
clean = part.strip()
if not clean:
return ""
return f"{clean[0].upper()}."
def _sentence_case(text: str) -> str:
stripped = text.strip()
if not stripped:
return stripped
lower = stripped[0].upper() + stripped[1:]
return lower
def _format_container(metadata: WorkMetadata) -> str:
if not metadata.container_title and not metadata.volume and not metadata.pages:
return ""
pieces = []
if metadata.container_title:
pieces.append(metadata.container_title)
volume_issue = ""
if metadata.volume:
volume_issue = metadata.volume
if metadata.issue:
volume_issue += f"({metadata.issue})"
elif metadata.issue:
volume_issue = f"({metadata.issue})"
if volume_issue:
pieces.append(volume_issue)
if metadata.source == "arxiv" and metadata.identifier:
pieces.append(f"arXiv:{metadata.identifier}")
if metadata.pages:
pieces.append(metadata.pages)
container = ", ".join(pieces)
if container and not container.endswith("."):
container += "."
return container

View file

@@ -0,0 +1,50 @@
import re
from urllib.parse import unquote
from .exceptions import CitationError
# Matches new-style IDs ("2101.00001") and old-style "archive/NNNNNNN" IDs;
# an optional version suffix ("v2") is allowed but excluded from the
# captured "id" group.
ARXIV_ID = re.compile(
    r"^(?P<id>(?:\d{4}\.\d{4,5}|[a-z-]+\/\d{7}))(?:v\d+)?$",
    flags=re.IGNORECASE,
)
# DOIs: "10." + 4-9 digit registrant code + "/" + non-space suffix.
DOI_PATTERN = re.compile(r"10\.\d{4,9}/\S+", flags=re.IGNORECASE)
def normalize_arxiv_identifier(raw: str) -> str:
    """Reduce an arXiv URL, "arXiv:" prefix, or bare ID to a canonical ID.

    Version suffixes (e.g. "v2") are stripped.

    Raises:
        CitationError: when no arXiv identifier can be recognized.
    """
    candidate = unquote(raw.strip())
    # Extract the ID segment from an abs/pdf URL when one is given.
    from_url = re.search(
        r"arxiv\.org/(?:abs|pdf)/(?P<id>[^?#/]+)", candidate, re.IGNORECASE
    )
    if from_url:
        candidate = from_url.group("id")
    candidate = re.sub(r"(?i)^arxiv:", "", candidate)
    candidate = re.sub(r"(?i)\.pdf$", "", candidate)
    candidate = candidate.split("?")[0].strip()
    matched = ARXIV_ID.match(candidate)
    if not matched:
        raise CitationError(f"Could not understand arXiv identifier: {raw}")
    # The pattern already keeps any version suffix out of the "id" group;
    # this extra strip is defensive.
    return re.sub(r"(?i)v\d+$", "", matched.group("id"))
def normalize_doi(raw: str) -> str:
    """Reduce a DOI URL or "doi:"-prefixed string to a lowercase bare DOI.

    Raises:
        CitationError: when no DOI pattern is found in the input.
    """
    candidate = unquote(raw.strip())
    candidate = re.sub(r"(?i)^https?://(dx\.)?doi\.org/", "", candidate)
    candidate = re.sub(r"(?i)^doi:\s*", "", candidate)
    candidate = candidate.split("?")[0].strip()
    found = DOI_PATTERN.search(candidate)
    if found is None:
        raise CitationError(f"Could not understand DOI: {raw}")
    # DOIs are case-insensitive; lowercase gives a stable canonical form.
    return found.group(0).lower()

27
build/lib/citer/models.py Normal file
View file

@@ -0,0 +1,27 @@
from dataclasses import dataclass
from typing import List, Optional
@dataclass
class Author:
    """A single author with separate given and family name parts."""

    given: str
    family: str

    @property
    def full_name(self) -> str:
        """Display name: given then family, trimmed when either part is empty."""
        combined = f"{self.given} {self.family}"
        return combined.strip()
@dataclass
class WorkMetadata:
    """Normalized bibliographic metadata for a cited work.

    Populated by the arXiv and Crossref fetchers and consumed by the
    APA formatter.
    """

    title: str
    authors: List[Author]
    year: Optional[int]  # publication year; None when unknown
    container_title: Optional[str] = None  # journal name, or "arXiv preprint"
    volume: Optional[str] = None
    issue: Optional[str] = None
    pages: Optional[str] = None
    doi: Optional[str] = None
    url: Optional[str] = None
    source: str = ""  # which fetcher produced this: "arxiv" or "doi"
    identifier: Optional[str] = None  # normalized arXiv ID or DOI