commit 42996b0f4e7efbdc22b5c1a08f79d3b40d27e71b Author: Alex Selimov Date: Thu Feb 12 23:55:07 2026 -0500 Initial version diff --git a/README.md b/README.md new file mode 100644 index 0000000..e3f2252 --- /dev/null +++ b/README.md @@ -0,0 +1,29 @@ +# Citer + +Simple CLI that turns an arXiv ID/URL or DOI/URL into a single-line APA citation. + +## Setup + +```bash +pip install -e . +``` + +## Usage + +```bash +# arXiv IDs or URLs +citer arxiv 2106.01342 +citer arxiv https://arxiv.org/abs/2106.01342 + +# DOIs or DOI URLs +citer doi 10.1038/nphys1170 +citer doi https://doi.org/10.1038/nphys1170 +``` + +Errors are printed with a clear message if an ID cannot be parsed or a lookup fails. + +## Tests + +```bash +python -m pytest +``` diff --git a/build/lib/citer/__init__.py b/build/lib/citer/__init__.py new file mode 100644 index 0000000..747cc0f --- /dev/null +++ b/build/lib/citer/__init__.py @@ -0,0 +1,3 @@ +"""APA citation helper CLI.""" + +__all__ = ["cli", "fetchers", "formatter", "identifiers", "models"] diff --git a/build/lib/citer/cli.py b/build/lib/citer/cli.py new file mode 100644 index 0000000..5b38122 --- /dev/null +++ b/build/lib/citer/cli.py @@ -0,0 +1,48 @@ +import argparse +import sys + +from .exceptions import CitationError +from .fetchers import fetch_arxiv, fetch_doi +from .formatter import format_apa +from .identifiers import normalize_arxiv_identifier, normalize_doi + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description="Create an APA style citation from an arXiv ID or DOI." 
+ ) + subparsers = parser.add_subparsers(dest="source", required=True) + + arxiv_parser = subparsers.add_parser("arxiv", help="Cite an arXiv identifier or URL") + arxiv_parser.add_argument("identifier", help="arXiv ID or URL") + + doi_parser = subparsers.add_parser("doi", help="Cite a DOI or DOI URL") + doi_parser.add_argument("identifier", help="DOI or DOI URL") + + return parser + + +def main(argv=None) -> int: + parser = build_parser() + args = parser.parse_args(argv) + + try: + if args.source == "arxiv": + arxiv_id = normalize_arxiv_identifier(args.identifier) + metadata = fetch_arxiv(arxiv_id) + elif args.source == "doi": + doi = normalize_doi(args.identifier) + metadata = fetch_doi(doi) + else: + parser.error("Unsupported source") + return 1 + except CitationError as exc: + print(f"Error: {exc}", file=sys.stderr) + return 1 + + print(format_apa(metadata)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/build/lib/citer/exceptions.py b/build/lib/citer/exceptions.py new file mode 100644 index 0000000..6ac4e4b --- /dev/null +++ b/build/lib/citer/exceptions.py @@ -0,0 +1,2 @@ +class CitationError(Exception): + """Raised when a citation cannot be created from the provided input.""" diff --git a/build/lib/citer/fetchers.py b/build/lib/citer/fetchers.py new file mode 100644 index 0000000..ad280cf --- /dev/null +++ b/build/lib/citer/fetchers.py @@ -0,0 +1,158 @@ +import datetime as _dt +from typing import List, Optional +from urllib.parse import quote +from xml.etree import ElementTree + +import requests + +from .exceptions import CitationError +from .models import Author, WorkMetadata + + +ARXIV_API = "http://export.arxiv.org/api/query" +CROSSREF_WORKS = "https://api.crossref.org/works/" +USER_AGENT = "citer-cli/0.1 (mailto:unknown@example.com)" + + +def fetch_arxiv(arxiv_id: str) -> WorkMetadata: + try: + response = requests.get( + ARXIV_API, + params={"id_list": arxiv_id}, + headers={"User-Agent": USER_AGENT}, + timeout=10, + ) + 
response.raise_for_status() + except requests.RequestException as exc: + raise CitationError(f"Failed to reach arXiv API: {exc}") from exc + + try: + root = ElementTree.fromstring(response.text) + except ElementTree.ParseError as exc: + raise CitationError("Received invalid XML from arXiv") from exc + + ns = { + "atom": "http://www.w3.org/2005/Atom", + "arxiv": "http://arxiv.org/schemas/atom", + } + entry = root.find("atom:entry", ns) + if entry is None: + raise CitationError(f"No entry found for arXiv ID {arxiv_id}") + + title = _clean(entry.findtext("atom:title", default="", namespaces=ns)) + authors = _parse_arxiv_authors(entry.findall("atom:author", ns)) + year = _parse_year(entry.findtext("atom:published", default="", namespaces=ns)) + doi = _clean(entry.findtext("arxiv:doi", default="", namespaces=ns)) or None + + url = f"https://arxiv.org/abs/{arxiv_id}" + return WorkMetadata( + title=title, + authors=authors, + year=year, + container_title="arXiv preprint", + doi=doi, + url=url, + source="arxiv", + identifier=arxiv_id, + ) + + +def fetch_doi(doi: str) -> WorkMetadata: + url = CROSSREF_WORKS + quote(doi) + try: + response = requests.get( + url, headers={"Accept": "application/json", "User-Agent": USER_AGENT}, timeout=10 + ) + response.raise_for_status() + data = response.json() + except requests.RequestException as exc: + raise CitationError(f"Failed to reach Crossref for DOI {doi}: {exc}") from exc + except ValueError as exc: + raise CitationError("Received invalid JSON from Crossref") from exc + + message = data.get("message", {}) + title = _clean(" ".join(message.get("title", [])).strip()) + authors = _parse_crossref_authors(message.get("author", [])) + year = _extract_year(message) + container_title = _clean((message.get("container-title") or [""])[0]) + volume = _clean(message.get("volume", "")).strip() or None + issue = _clean(message.get("issue", "")).strip() or None + pages = _clean(message.get("page", "")).strip() or None + url = message.get("URL") 
or f"https://doi.org/{doi}" + + return WorkMetadata( + title=title, + authors=authors, + year=year, + container_title=container_title or None, + volume=volume, + issue=issue, + pages=pages, + doi=doi, + url=url, + source="doi", + identifier=doi, + ) + + +def _clean(value: str) -> str: + return " ".join(value.split()) + + +def _parse_arxiv_authors(author_elements) -> List[Author]: + authors: List[Author] = [] + for element in author_elements: + name = element.findtext("{http://www.w3.org/2005/Atom}name", default="") + given, family = _split_author_name(name) + authors.append(Author(given=given, family=family)) + return authors + + +def _parse_crossref_authors(author_data) -> List[Author]: + authors: List[Author] = [] + for author in author_data: + if "family" in author or "given" in author: + given = author.get("given", "").strip() + family = author.get("family", "").strip() + elif "name" in author: + given, family = _split_author_name(author.get("name", "")) + else: + continue + authors.append(Author(given=given, family=family)) + return authors + + +def _split_author_name(name: str) -> tuple[str, str]: + clean_name = _clean(name) + if not clean_name: + return "", "" + if "," in clean_name: + family, given = [part.strip() for part in clean_name.split(",", 1)] + else: + parts = clean_name.split() + family = parts[-1] + given = " ".join(parts[:-1]) + return given, family + + +def _parse_year(value: str) -> Optional[int]: + value = value.strip() + if not value: + return None + try: + return _dt.datetime.fromisoformat(value).year + except ValueError: + if len(value) >= 4 and value[:4].isdigit(): + return int(value[:4]) + return None + + +def _extract_year(message: dict) -> int | None: + for key in ("published-print", "published-online", "issued"): + data = message.get(key, {}) + parts = data.get("date-parts") if isinstance(data, dict) else None + if parts and len(parts) and len(parts[0]): + maybe_year = parts[0][0] + if isinstance(maybe_year, int): + return maybe_year 
+ return None diff --git a/build/lib/citer/formatter.py b/build/lib/citer/formatter.py new file mode 100644 index 0000000..8f2e94c --- /dev/null +++ b/build/lib/citer/formatter.py @@ -0,0 +1,90 @@ +from typing import Iterable + +from .models import Author, WorkMetadata + + +def format_apa(metadata: WorkMetadata) -> str: + authors_text = format_authors(metadata.authors) + year_text = f"({metadata.year})." if metadata.year else "(n.d.)." + title_text = _sentence_case(metadata.title).rstrip(".") + "." + container_text = _format_container(metadata) + + parts = [authors_text, year_text, title_text] + if container_text: + parts.append(container_text) + + link = None + if metadata.doi: + link = f"https://doi.org/{metadata.doi}" + elif metadata.url: + link = metadata.url + if link: + parts.append(link) + + return " ".join(part.strip() for part in parts if part).strip() + + +def format_authors(authors: Iterable[Author]) -> str: + formatted = [format_author(author) for author in authors if format_author(author)] + if not formatted: + return "" + if len(formatted) == 1: + return formatted[0] + if len(formatted) == 2: + return f"{formatted[0]}, & {formatted[1]}" + return ", ".join(formatted[:-1]) + f", & {formatted[-1]}" + + +def format_author(author: Author) -> str: + given_initials = " ".join(_initial(part) for part in author.given.split() if part) + family = author.family.strip() + if family and given_initials: + return f"{family}, {given_initials}" + if family: + return family + return given_initials + + +def _initial(part: str) -> str: + clean = part.strip() + if not clean: + return "" + return f"{clean[0].upper()}." 
+ + +def _sentence_case(text: str) -> str: + stripped = text.strip() + if not stripped: + return stripped + lower = stripped[0].upper() + stripped[1:] + return lower + + +def _format_container(metadata: WorkMetadata) -> str: + if not metadata.container_title and not metadata.volume and not metadata.pages: + return "" + + pieces = [] + if metadata.container_title: + pieces.append(metadata.container_title) + volume_issue = "" + if metadata.volume: + volume_issue = metadata.volume + if metadata.issue: + volume_issue += f"({metadata.issue})" + elif metadata.issue: + volume_issue = f"({metadata.issue})" + + if volume_issue: + pieces.append(volume_issue) + + if metadata.source == "arxiv" and metadata.identifier: + pieces.append(f"arXiv:{metadata.identifier}") + + if metadata.pages: + pieces.append(metadata.pages) + + container = ", ".join(pieces) + if container and not container.endswith("."): + container += "." + return container diff --git a/build/lib/citer/identifiers.py b/build/lib/citer/identifiers.py new file mode 100644 index 0000000..913861a --- /dev/null +++ b/build/lib/citer/identifiers.py @@ -0,0 +1,50 @@ +import re +from urllib.parse import unquote + +from .exceptions import CitationError + + +ARXIV_ID = re.compile( + r"^(?P<id>(?:\d{4}\.\d{4,5}|[a-z-]+\/\d{7}))(?:v\d+)?$", + flags=re.IGNORECASE, +) +DOI_PATTERN = re.compile(r"10\.\d{4,9}/\S+", flags=re.IGNORECASE) + + +def normalize_arxiv_identifier(raw: str) -> str: + text = raw.strip() + text = unquote(text) + + url_match = re.search(r"arxiv\.org/(?:abs|pdf)/(?P<id>[^?#/]+)", text, re.IGNORECASE) + if url_match: + text = url_match.group("id") + + text = re.sub(r"(?i)^arxiv:", "", text) + text = re.sub(r"(?i)\.pdf$", "", text) + text = text.split("?")[0] + text = text.strip() + + match = ARXIV_ID.match(text) + if not match: + raise CitationError(f"Could not understand arXiv identifier: {raw}") + + base_id = match.group("id") + base_id = re.sub(r"(?i)v\d+$", "", base_id) + return base_id + + +def normalize_doi(raw: 
str) -> str: + text = raw.strip() + text = unquote(text) + + text = re.sub(r"(?i)^https?://(dx\.)?doi\.org/", "", text) + text = re.sub(r"(?i)^doi:\s*", "", text) + text = text.split("?")[0] + text = text.strip() + + match = DOI_PATTERN.search(text) + if not match: + raise CitationError(f"Could not understand DOI: {raw}") + + doi = match.group(0) + return doi.lower() diff --git a/build/lib/citer/models.py b/build/lib/citer/models.py new file mode 100644 index 0000000..b910877 --- /dev/null +++ b/build/lib/citer/models.py @@ -0,0 +1,27 @@ +from dataclasses import dataclass +from typing import List, Optional + + +@dataclass +class Author: + given: str + family: str + + @property + def full_name(self) -> str: + return f"{self.given} {self.family}".strip() + + +@dataclass +class WorkMetadata: + title: str + authors: List[Author] + year: Optional[int] + container_title: Optional[str] = None + volume: Optional[str] = None + issue: Optional[str] = None + pages: Optional[str] = None + doi: Optional[str] = None + url: Optional[str] = None + source: str = "" + identifier: Optional[str] = None diff --git a/citer.egg-info/PKG-INFO b/citer.egg-info/PKG-INFO new file mode 100644 index 0000000..33417b0 --- /dev/null +++ b/citer.egg-info/PKG-INFO @@ -0,0 +1,37 @@ +Metadata-Version: 2.4 +Name: citer +Version: 0.1.0 +Summary: CLI to create APA style citations from arXiv IDs or DOIs +Requires-Python: >=3.9 +Description-Content-Type: text/markdown +Requires-Dist: requests>=2.31.0 + +# Citer + +Simple CLI that turns an arXiv ID/URL or DOI/URL into a single-line APA citation. + +## Setup + +```bash +pip install -e . +``` + +## Usage + +```bash +# arXiv IDs or URLs +citer arxiv 2106.01342 +citer arxiv https://arxiv.org/abs/2106.01342 + +# DOIs or DOI URLs +citer doi 10.1038/nphys1170 +citer doi https://doi.org/10.1038/nphys1170 +``` + +Errors are printed with a clear message if an ID cannot be parsed or a lookup fails. 
+ +## Tests + +```bash +python -m pytest +``` diff --git a/citer.egg-info/SOURCES.txt b/citer.egg-info/SOURCES.txt new file mode 100644 index 0000000..8a2f550 --- /dev/null +++ b/citer.egg-info/SOURCES.txt @@ -0,0 +1,17 @@ +README.md +pyproject.toml +citer/__init__.py +citer/cli.py +citer/exceptions.py +citer/fetchers.py +citer/formatter.py +citer/identifiers.py +citer/models.py +citer.egg-info/PKG-INFO +citer.egg-info/SOURCES.txt +citer.egg-info/dependency_links.txt +citer.egg-info/entry_points.txt +citer.egg-info/requires.txt +citer.egg-info/top_level.txt +tests/test_formatter.py +tests/test_identifiers.py \ No newline at end of file diff --git a/citer.egg-info/dependency_links.txt b/citer.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/citer.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/citer.egg-info/entry_points.txt b/citer.egg-info/entry_points.txt new file mode 100644 index 0000000..ea2117c --- /dev/null +++ b/citer.egg-info/entry_points.txt @@ -0,0 +1,2 @@ +[console_scripts] +citer = citer.cli:main diff --git a/citer.egg-info/requires.txt b/citer.egg-info/requires.txt new file mode 100644 index 0000000..0eb8cae --- /dev/null +++ b/citer.egg-info/requires.txt @@ -0,0 +1 @@ +requests>=2.31.0 diff --git a/citer.egg-info/top_level.txt b/citer.egg-info/top_level.txt new file mode 100644 index 0000000..0702aa6 --- /dev/null +++ b/citer.egg-info/top_level.txt @@ -0,0 +1 @@ +citer diff --git a/citer/__init__.py b/citer/__init__.py new file mode 100644 index 0000000..747cc0f --- /dev/null +++ b/citer/__init__.py @@ -0,0 +1,3 @@ +"""APA citation helper CLI.""" + +__all__ = ["cli", "fetchers", "formatter", "identifiers", "models"] diff --git a/citer/__pycache__/__init__.cpython-313.pyc b/citer/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000..3102582 Binary files /dev/null and b/citer/__pycache__/__init__.cpython-313.pyc differ diff --git a/citer/__pycache__/exceptions.cpython-313.pyc 
b/citer/__pycache__/exceptions.cpython-313.pyc new file mode 100644 index 0000000..e230940 Binary files /dev/null and b/citer/__pycache__/exceptions.cpython-313.pyc differ diff --git a/citer/__pycache__/formatter.cpython-313.pyc b/citer/__pycache__/formatter.cpython-313.pyc new file mode 100644 index 0000000..78f6ced Binary files /dev/null and b/citer/__pycache__/formatter.cpython-313.pyc differ diff --git a/citer/__pycache__/identifiers.cpython-313.pyc b/citer/__pycache__/identifiers.cpython-313.pyc new file mode 100644 index 0000000..24b9bbb Binary files /dev/null and b/citer/__pycache__/identifiers.cpython-313.pyc differ diff --git a/citer/__pycache__/models.cpython-313.pyc b/citer/__pycache__/models.cpython-313.pyc new file mode 100644 index 0000000..bbfbd44 Binary files /dev/null and b/citer/__pycache__/models.cpython-313.pyc differ diff --git a/citer/cli.py b/citer/cli.py new file mode 100644 index 0000000..5b38122 --- /dev/null +++ b/citer/cli.py @@ -0,0 +1,48 @@ +import argparse +import sys + +from .exceptions import CitationError +from .fetchers import fetch_arxiv, fetch_doi +from .formatter import format_apa +from .identifiers import normalize_arxiv_identifier, normalize_doi + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description="Create an APA style citation from an arXiv ID or DOI." 
+ ) + subparsers = parser.add_subparsers(dest="source", required=True) + + arxiv_parser = subparsers.add_parser("arxiv", help="Cite an arXiv identifier or URL") + arxiv_parser.add_argument("identifier", help="arXiv ID or URL") + + doi_parser = subparsers.add_parser("doi", help="Cite a DOI or DOI URL") + doi_parser.add_argument("identifier", help="DOI or DOI URL") + + return parser + + +def main(argv=None) -> int: + parser = build_parser() + args = parser.parse_args(argv) + + try: + if args.source == "arxiv": + arxiv_id = normalize_arxiv_identifier(args.identifier) + metadata = fetch_arxiv(arxiv_id) + elif args.source == "doi": + doi = normalize_doi(args.identifier) + metadata = fetch_doi(doi) + else: + parser.error("Unsupported source") + return 1 + except CitationError as exc: + print(f"Error: {exc}", file=sys.stderr) + return 1 + + print(format_apa(metadata)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/citer/exceptions.py b/citer/exceptions.py new file mode 100644 index 0000000..6ac4e4b --- /dev/null +++ b/citer/exceptions.py @@ -0,0 +1,2 @@ +class CitationError(Exception): + """Raised when a citation cannot be created from the provided input.""" diff --git a/citer/fetchers.py b/citer/fetchers.py new file mode 100644 index 0000000..ad280cf --- /dev/null +++ b/citer/fetchers.py @@ -0,0 +1,158 @@ +import datetime as _dt +from typing import List, Optional +from urllib.parse import quote +from xml.etree import ElementTree + +import requests + +from .exceptions import CitationError +from .models import Author, WorkMetadata + + +ARXIV_API = "http://export.arxiv.org/api/query" +CROSSREF_WORKS = "https://api.crossref.org/works/" +USER_AGENT = "citer-cli/0.1 (mailto:unknown@example.com)" + + +def fetch_arxiv(arxiv_id: str) -> WorkMetadata: + try: + response = requests.get( + ARXIV_API, + params={"id_list": arxiv_id}, + headers={"User-Agent": USER_AGENT}, + timeout=10, + ) + response.raise_for_status() + except 
requests.RequestException as exc: + raise CitationError(f"Failed to reach arXiv API: {exc}") from exc + + try: + root = ElementTree.fromstring(response.text) + except ElementTree.ParseError as exc: + raise CitationError("Received invalid XML from arXiv") from exc + + ns = { + "atom": "http://www.w3.org/2005/Atom", + "arxiv": "http://arxiv.org/schemas/atom", + } + entry = root.find("atom:entry", ns) + if entry is None: + raise CitationError(f"No entry found for arXiv ID {arxiv_id}") + + title = _clean(entry.findtext("atom:title", default="", namespaces=ns)) + authors = _parse_arxiv_authors(entry.findall("atom:author", ns)) + year = _parse_year(entry.findtext("atom:published", default="", namespaces=ns)) + doi = _clean(entry.findtext("arxiv:doi", default="", namespaces=ns)) or None + + url = f"https://arxiv.org/abs/{arxiv_id}" + return WorkMetadata( + title=title, + authors=authors, + year=year, + container_title="arXiv preprint", + doi=doi, + url=url, + source="arxiv", + identifier=arxiv_id, + ) + + +def fetch_doi(doi: str) -> WorkMetadata: + url = CROSSREF_WORKS + quote(doi) + try: + response = requests.get( + url, headers={"Accept": "application/json", "User-Agent": USER_AGENT}, timeout=10 + ) + response.raise_for_status() + data = response.json() + except requests.RequestException as exc: + raise CitationError(f"Failed to reach Crossref for DOI {doi}: {exc}") from exc + except ValueError as exc: + raise CitationError("Received invalid JSON from Crossref") from exc + + message = data.get("message", {}) + title = _clean(" ".join(message.get("title", [])).strip()) + authors = _parse_crossref_authors(message.get("author", [])) + year = _extract_year(message) + container_title = _clean((message.get("container-title") or [""])[0]) + volume = _clean(message.get("volume", "")).strip() or None + issue = _clean(message.get("issue", "")).strip() or None + pages = _clean(message.get("page", "")).strip() or None + url = message.get("URL") or f"https://doi.org/{doi}" + + 
return WorkMetadata( + title=title, + authors=authors, + year=year, + container_title=container_title or None, + volume=volume, + issue=issue, + pages=pages, + doi=doi, + url=url, + source="doi", + identifier=doi, + ) + + +def _clean(value: str) -> str: + return " ".join(value.split()) + + +def _parse_arxiv_authors(author_elements) -> List[Author]: + authors: List[Author] = [] + for element in author_elements: + name = element.findtext("{http://www.w3.org/2005/Atom}name", default="") + given, family = _split_author_name(name) + authors.append(Author(given=given, family=family)) + return authors + + +def _parse_crossref_authors(author_data) -> List[Author]: + authors: List[Author] = [] + for author in author_data: + if "family" in author or "given" in author: + given = author.get("given", "").strip() + family = author.get("family", "").strip() + elif "name" in author: + given, family = _split_author_name(author.get("name", "")) + else: + continue + authors.append(Author(given=given, family=family)) + return authors + + +def _split_author_name(name: str) -> tuple[str, str]: + clean_name = _clean(name) + if not clean_name: + return "", "" + if "," in clean_name: + family, given = [part.strip() for part in clean_name.split(",", 1)] + else: + parts = clean_name.split() + family = parts[-1] + given = " ".join(parts[:-1]) + return given, family + + +def _parse_year(value: str) -> Optional[int]: + value = value.strip() + if not value: + return None + try: + return _dt.datetime.fromisoformat(value).year + except ValueError: + if len(value) >= 4 and value[:4].isdigit(): + return int(value[:4]) + return None + + +def _extract_year(message: dict) -> int | None: + for key in ("published-print", "published-online", "issued"): + data = message.get(key, {}) + parts = data.get("date-parts") if isinstance(data, dict) else None + if parts and len(parts) and len(parts[0]): + maybe_year = parts[0][0] + if isinstance(maybe_year, int): + return maybe_year + return None diff --git 
a/citer/formatter.py b/citer/formatter.py new file mode 100644 index 0000000..8f2e94c --- /dev/null +++ b/citer/formatter.py @@ -0,0 +1,90 @@ +from typing import Iterable + +from .models import Author, WorkMetadata + + +def format_apa(metadata: WorkMetadata) -> str: + authors_text = format_authors(metadata.authors) + year_text = f"({metadata.year})." if metadata.year else "(n.d.)." + title_text = _sentence_case(metadata.title).rstrip(".") + "." + container_text = _format_container(metadata) + + parts = [authors_text, year_text, title_text] + if container_text: + parts.append(container_text) + + link = None + if metadata.doi: + link = f"https://doi.org/{metadata.doi}" + elif metadata.url: + link = metadata.url + if link: + parts.append(link) + + return " ".join(part.strip() for part in parts if part).strip() + + +def format_authors(authors: Iterable[Author]) -> str: + formatted = [format_author(author) for author in authors if format_author(author)] + if not formatted: + return "" + if len(formatted) == 1: + return formatted[0] + if len(formatted) == 2: + return f"{formatted[0]}, & {formatted[1]}" + return ", ".join(formatted[:-1]) + f", & {formatted[-1]}" + + +def format_author(author: Author) -> str: + given_initials = " ".join(_initial(part) for part in author.given.split() if part) + family = author.family.strip() + if family and given_initials: + return f"{family}, {given_initials}" + if family: + return family + return given_initials + + +def _initial(part: str) -> str: + clean = part.strip() + if not clean: + return "" + return f"{clean[0].upper()}." 
+ + +def _sentence_case(text: str) -> str: + stripped = text.strip() + if not stripped: + return stripped + lower = stripped[0].upper() + stripped[1:] + return lower + + +def _format_container(metadata: WorkMetadata) -> str: + if not metadata.container_title and not metadata.volume and not metadata.pages: + return "" + + pieces = [] + if metadata.container_title: + pieces.append(metadata.container_title) + volume_issue = "" + if metadata.volume: + volume_issue = metadata.volume + if metadata.issue: + volume_issue += f"({metadata.issue})" + elif metadata.issue: + volume_issue = f"({metadata.issue})" + + if volume_issue: + pieces.append(volume_issue) + + if metadata.source == "arxiv" and metadata.identifier: + pieces.append(f"arXiv:{metadata.identifier}") + + if metadata.pages: + pieces.append(metadata.pages) + + container = ", ".join(pieces) + if container and not container.endswith("."): + container += "." + return container diff --git a/citer/identifiers.py b/citer/identifiers.py new file mode 100644 index 0000000..913861a --- /dev/null +++ b/citer/identifiers.py @@ -0,0 +1,50 @@ +import re +from urllib.parse import unquote +
+from .exceptions import CitationError + + +ARXIV_ID = re.compile( + r"^(?P<id>(?:\d{4}\.\d{4,5}|[a-z-]+\/\d{7}))(?:v\d+)?$", + flags=re.IGNORECASE, +) +DOI_PATTERN = re.compile(r"10\.\d{4,9}/\S+", flags=re.IGNORECASE) + + +def normalize_arxiv_identifier(raw: str) -> str: + text = raw.strip() + text = unquote(text) + + url_match = re.search(r"arxiv\.org/(?:abs|pdf)/(?P<id>[^?#/]+)", text, re.IGNORECASE) + if url_match: + text = url_match.group("id") + + text = re.sub(r"(?i)^arxiv:", "", text) + text = re.sub(r"(?i)\.pdf$", "", text) + text = text.split("?")[0] + text = text.strip() + + match = ARXIV_ID.match(text) + if not match: + raise CitationError(f"Could not understand arXiv identifier: {raw}") + + base_id = match.group("id") + base_id = re.sub(r"(?i)v\d+$", "", base_id) + return base_id + + +def normalize_doi(raw: str) -> str: + text = 
raw.strip() + text = unquote(text) + + text = re.sub(r"(?i)^https?://(dx\.)?doi\.org/", "", text) + text = re.sub(r"(?i)^doi:\s*", "", text) + text = text.split("?")[0] + text = text.strip() + + match = DOI_PATTERN.search(text) + if not match: + raise CitationError(f"Could not understand DOI: {raw}") + + doi = match.group(0) + return doi.lower() diff --git a/citer/models.py b/citer/models.py new file mode 100644 index 0000000..b910877 --- /dev/null +++ b/citer/models.py @@ -0,0 +1,27 @@ +from dataclasses import dataclass +from typing import List, Optional + + +@dataclass +class Author: + given: str + family: str + + @property + def full_name(self) -> str: + return f"{self.given} {self.family}".strip() + + +@dataclass +class WorkMetadata: + title: str + authors: List[Author] + year: Optional[int] + container_title: Optional[str] = None + volume: Optional[str] = None + issue: Optional[str] = None + pages: Optional[str] = None + doi: Optional[str] = None + url: Optional[str] = None + source: str = "" + identifier: Optional[str] = None diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..7dbfff9 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,20 @@ +[build-system] +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "citer" +version = "0.1.0" +description = "CLI to create APA style citations from arXiv IDs or DOIs" +readme = "README.md" +requires-python = ">=3.9" +dependencies = [ + "requests>=2.31.0", +] + +[project.scripts] +citer = "citer.cli:main" + +[tool.pytest.ini_options] +addopts = "-q" +testpaths = ["tests"] diff --git a/tests/__pycache__/test_formatter.cpython-313-pytest-8.3.5.pyc b/tests/__pycache__/test_formatter.cpython-313-pytest-8.3.5.pyc new file mode 100644 index 0000000..6d628b9 Binary files /dev/null and b/tests/__pycache__/test_formatter.cpython-313-pytest-8.3.5.pyc differ diff --git a/tests/__pycache__/test_identifiers.cpython-313-pytest-8.3.5.pyc 
b/tests/__pycache__/test_identifiers.cpython-313-pytest-8.3.5.pyc new file mode 100644 index 0000000..690939c Binary files /dev/null and b/tests/__pycache__/test_identifiers.cpython-313-pytest-8.3.5.pyc differ diff --git a/tests/test_formatter.py b/tests/test_formatter.py new file mode 100644 index 0000000..69728fb --- /dev/null +++ b/tests/test_formatter.py @@ -0,0 +1,44 @@ +from citer.formatter import format_apa +from citer.models import Author, WorkMetadata + + +def test_format_doi_article(): + metadata = WorkMetadata( + title="Sample study on testing", + authors=[Author("Jane", "Doe"), Author("John", "Smith")], + year=2020, + container_title="Journal of Tests", + volume="12", + issue="3", + pages="45-67", + doi="10.1234/example.doi", + url="https://doi.org/10.1234/example.doi", + source="doi", + identifier="10.1234/example.doi", + ) + + citation = format_apa(metadata) + assert ( + citation + == "Doe, J., & Smith, J. (2020). Sample study on testing. " + "Journal of Tests, 12(3), 45-67. https://doi.org/10.1234/example.doi" + ) + + +def test_format_arxiv_preprint(): + metadata = WorkMetadata( + title="Deep learning for cats", + authors=[Author("Alice", "Nguyen"), Author("Bob", "Smith")], + year=2021, + container_title="arXiv preprint", + url="https://arxiv.org/abs/2101.00001", + source="arxiv", + identifier="2101.00001", + ) + + citation = format_apa(metadata) + assert ( + citation + == "Nguyen, A., & Smith, B. (2021). Deep learning for cats. " + "arXiv preprint, arXiv:2101.00001. 
https://arxiv.org/abs/2101.00001" + ) diff --git a/tests/test_identifiers.py b/tests/test_identifiers.py new file mode 100644 index 0000000..8f39ec3 --- /dev/null +++ b/tests/test_identifiers.py @@ -0,0 +1,25 @@ +import pytest + +from citer.identifiers import normalize_arxiv_identifier, normalize_doi + + +def test_normalize_arxiv_variants(): + assert normalize_arxiv_identifier("2106.01342v2") == "2106.01342" + assert normalize_arxiv_identifier("https://arxiv.org/abs/2106.01342") == "2106.01342" + assert normalize_arxiv_identifier("arXiv:hep-th/9901001") == "hep-th/9901001" + + +def test_normalize_arxiv_invalid(): + with pytest.raises(Exception): + normalize_arxiv_identifier("not-an-id") + + +def test_normalize_doi_variants(): + assert normalize_doi("https://doi.org/10.1038/nphys1170") == "10.1038/nphys1170" + assert normalize_doi("DOI:10.5555/12345678") == "10.5555/12345678" + assert normalize_doi("10.1000/182") == "10.1000/182" + + +def test_normalize_doi_invalid(): + with pytest.raises(Exception): + normalize_doi("not-a-doi")