From 42996b0f4e7efbdc22b5c1a08f79d3b40d27e71b Mon Sep 17 00:00:00 2001 From: Alex Selimov Date: Thu, 12 Feb 2026 23:55:07 -0500 Subject: [PATCH] Initial version --- README.md | 29 ++++ build/lib/citer/__init__.py | 3 + build/lib/citer/cli.py | 48 ++++++ build/lib/citer/exceptions.py | 2 + build/lib/citer/fetchers.py | 158 ++++++++++++++++++ build/lib/citer/formatter.py | 90 ++++++++++ build/lib/citer/identifiers.py | 50 ++++++ build/lib/citer/models.py | 27 +++ citer.egg-info/PKG-INFO | 37 ++++ citer.egg-info/SOURCES.txt | 17 ++ citer.egg-info/dependency_links.txt | 1 + citer.egg-info/entry_points.txt | 2 + citer.egg-info/requires.txt | 1 + citer.egg-info/top_level.txt | 1 + citer/__init__.py | 3 + citer/__pycache__/__init__.cpython-313.pyc | Bin 0 -> 251 bytes citer/__pycache__/exceptions.cpython-313.pyc | Bin 0 -> 473 bytes citer/__pycache__/formatter.cpython-313.pyc | Bin 0 -> 4836 bytes citer/__pycache__/identifiers.cpython-313.pyc | Bin 0 -> 2456 bytes citer/__pycache__/models.cpython-313.pyc | Bin 0 -> 1567 bytes citer/cli.py | 48 ++++++ citer/exceptions.py | 2 + citer/fetchers.py | 158 ++++++++++++++++++ citer/formatter.py | 90 ++++++++++ citer/identifiers.py | 50 ++++++ citer/models.py | 27 +++ pyproject.toml | 20 +++ ...est_formatter.cpython-313-pytest-8.3.5.pyc | Bin 0 -> 2969 bytes ...t_identifiers.cpython-313-pytest-8.3.5.pyc | Bin 0 -> 5598 bytes tests/test_formatter.py | 44 +++++ tests/test_identifiers.py | 25 +++ 31 files changed, 933 insertions(+) create mode 100644 README.md create mode 100644 build/lib/citer/__init__.py create mode 100644 build/lib/citer/cli.py create mode 100644 build/lib/citer/exceptions.py create mode 100644 build/lib/citer/fetchers.py create mode 100644 build/lib/citer/formatter.py create mode 100644 build/lib/citer/identifiers.py create mode 100644 build/lib/citer/models.py create mode 100644 citer.egg-info/PKG-INFO create mode 100644 citer.egg-info/SOURCES.txt create mode 100644 citer.egg-info/dependency_links.txt create 
mode 100644 citer.egg-info/entry_points.txt create mode 100644 citer.egg-info/requires.txt create mode 100644 citer.egg-info/top_level.txt create mode 100644 citer/__init__.py create mode 100644 citer/__pycache__/__init__.cpython-313.pyc create mode 100644 citer/__pycache__/exceptions.cpython-313.pyc create mode 100644 citer/__pycache__/formatter.cpython-313.pyc create mode 100644 citer/__pycache__/identifiers.cpython-313.pyc create mode 100644 citer/__pycache__/models.cpython-313.pyc create mode 100644 citer/cli.py create mode 100644 citer/exceptions.py create mode 100644 citer/fetchers.py create mode 100644 citer/formatter.py create mode 100644 citer/identifiers.py create mode 100644 citer/models.py create mode 100644 pyproject.toml create mode 100644 tests/__pycache__/test_formatter.cpython-313-pytest-8.3.5.pyc create mode 100644 tests/__pycache__/test_identifiers.cpython-313-pytest-8.3.5.pyc create mode 100644 tests/test_formatter.py create mode 100644 tests/test_identifiers.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..e3f2252 --- /dev/null +++ b/README.md @@ -0,0 +1,29 @@ +# Citer + +Simple CLI that turns an arXiv ID/URL or DOI/URL into a single-line APA citation. + +## Setup + +```bash +pip install -e . +``` + +## Usage + +```bash +# arXiv IDs or URLs +citer arxiv 2106.01342 +citer arxiv https://arxiv.org/abs/2106.01342 + +# DOIs or DOI URLs +citer doi 10.1038/nphys1170 +citer doi https://doi.org/10.1038/nphys1170 +``` + +Errors are printed with a clear message if an ID cannot be parsed or a lookup fails. 
"""Command line interface for the citer APA-citation tool."""

import argparse
import sys

from .exceptions import CitationError
from .fetchers import fetch_arxiv, fetch_doi
from .formatter import format_apa
from .identifiers import normalize_arxiv_identifier, normalize_doi


def build_parser() -> argparse.ArgumentParser:
    """Return the argument parser with the ``arxiv`` and ``doi`` subcommands."""
    parser = argparse.ArgumentParser(
        description="Create an APA style citation from an arXiv ID or DOI."
    )
    # required=True makes argparse itself reject a missing subcommand.
    subparsers = parser.add_subparsers(dest="source", required=True)

    arxiv_parser = subparsers.add_parser("arxiv", help="Cite an arXiv identifier or URL")
    arxiv_parser.add_argument("identifier", help="arXiv ID or URL")

    doi_parser = subparsers.add_parser("doi", help="Cite a DOI or DOI URL")
    doi_parser.add_argument("identifier", help="DOI or DOI URL")

    return parser


def main(argv=None) -> int:
    """CLI entry point.

    Parses *argv* (defaults to ``sys.argv[1:]``), fetches metadata for the
    requested identifier, and prints a one-line APA citation to stdout.

    Returns:
        0 on success, 1 when the identifier cannot be parsed or fetched.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    try:
        if args.source == "arxiv":
            arxiv_id = normalize_arxiv_identifier(args.identifier)
            metadata = fetch_arxiv(arxiv_id)
        elif args.source == "doi":
            doi = normalize_doi(args.identifier)
            metadata = fetch_doi(doi)
        else:  # unreachable: add_subparsers(required=True) rejects others.
            # parser.error() never returns (it raises SystemExit), so the
            # original dead "return 1" after it was removed.
            parser.error("Unsupported source")
    except CitationError as exc:
        print(f"Error: {exc}", file=sys.stderr)
        return 1

    print(format_apa(metadata))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
"""Metadata fetchers: arXiv Atom API and Crossref REST API."""

import datetime as _dt
from typing import List, Optional
from urllib.parse import quote
from xml.etree import ElementTree

import requests

from .exceptions import CitationError
from .models import Author, WorkMetadata


# HTTPS instead of the original http:// — export.arxiv.org serves the API
# over TLS as well, avoiding a plaintext hop for the query.
ARXIV_API = "https://export.arxiv.org/api/query"
CROSSREF_WORKS = "https://api.crossref.org/works/"
# Crossref asks polite clients to identify themselves with a mailto UA.
USER_AGENT = "citer-cli/0.1 (mailto:unknown@example.com)"


def fetch_arxiv(arxiv_id: str) -> WorkMetadata:
    """Fetch work metadata for *arxiv_id* from the arXiv Atom feed.

    Raises:
        CitationError: on network failure, malformed XML, or when the feed
            contains no entry for the ID.
    """
    try:
        response = requests.get(
            ARXIV_API,
            params={"id_list": arxiv_id},
            headers={"User-Agent": USER_AGENT},
            timeout=10,
        )
        response.raise_for_status()
    except requests.RequestException as exc:
        raise CitationError(f"Failed to reach arXiv API: {exc}") from exc

    try:
        root = ElementTree.fromstring(response.text)
    except ElementTree.ParseError as exc:
        raise CitationError("Received invalid XML from arXiv") from exc

    ns = {
        "atom": "http://www.w3.org/2005/Atom",
        "arxiv": "http://arxiv.org/schemas/atom",
    }
    # A feed without an <entry> is treated as "no such ID" here.
    # NOTE(review): arXiv may also report bad IDs via an error entry —
    # confirm against the API before relying on this check alone.
    entry = root.find("atom:entry", ns)
    if entry is None:
        raise CitationError(f"No entry found for arXiv ID {arxiv_id}")

    title = _clean(entry.findtext("atom:title", default="", namespaces=ns))
    authors = _parse_arxiv_authors(entry.findall("atom:author", ns))
    year = _parse_year(entry.findtext("atom:published", default="", namespaces=ns))
    doi = _clean(entry.findtext("arxiv:doi", default="", namespaces=ns)) or None

    url = f"https://arxiv.org/abs/{arxiv_id}"
    return WorkMetadata(
        title=title,
        authors=authors,
        year=year,
        container_title="arXiv preprint",
        doi=doi,
        url=url,
        source="arxiv",
        identifier=arxiv_id,
    )
+ try: + response = requests.get( + url, headers={"Accept": "application/json", "User-Agent": USER_AGENT}, timeout=10 + ) + response.raise_for_status() + data = response.json() + except requests.RequestException as exc: + raise CitationError(f"Failed to reach Crossref for DOI {doi}: {exc}") from exc + except ValueError as exc: + raise CitationError("Received invalid JSON from Crossref") from exc + + message = data.get("message", {}) + title = _clean(" ".join(message.get("title", [])).strip()) + authors = _parse_crossref_authors(message.get("author", [])) + year = _extract_year(message) + container_title = _clean((message.get("container-title") or [""])[0]) + volume = _clean(message.get("volume", "")).strip() or None + issue = _clean(message.get("issue", "")).strip() or None + pages = _clean(message.get("page", "")).strip() or None + url = message.get("URL") or f"https://doi.org/{doi}" + + return WorkMetadata( + title=title, + authors=authors, + year=year, + container_title=container_title or None, + volume=volume, + issue=issue, + pages=pages, + doi=doi, + url=url, + source="doi", + identifier=doi, + ) + + +def _clean(value: str) -> str: + return " ".join(value.split()) + + +def _parse_arxiv_authors(author_elements) -> List[Author]: + authors: List[Author] = [] + for element in author_elements: + name = element.findtext("{http://www.w3.org/2005/Atom}name", default="") + given, family = _split_author_name(name) + authors.append(Author(given=given, family=family)) + return authors + + +def _parse_crossref_authors(author_data) -> List[Author]: + authors: List[Author] = [] + for author in author_data: + if "family" in author or "given" in author: + given = author.get("given", "").strip() + family = author.get("family", "").strip() + elif "name" in author: + given, family = _split_author_name(author.get("name", "")) + else: + continue + authors.append(Author(given=given, family=family)) + return authors + + +def _split_author_name(name: str) -> tuple[str, str]: + 
clean_name = _clean(name) + if not clean_name: + return "", "" + if "," in clean_name: + family, given = [part.strip() for part in clean_name.split(",", 1)] + else: + parts = clean_name.split() + family = parts[-1] + given = " ".join(parts[:-1]) + return given, family + + +def _parse_year(value: str) -> Optional[int]: + value = value.strip() + if not value: + return None + try: + return _dt.datetime.fromisoformat(value).year + except ValueError: + if len(value) >= 4 and value[:4].isdigit(): + return int(value[:4]) + return None + + +def _extract_year(message: dict) -> int | None: + for key in ("published-print", "published-online", "issued"): + data = message.get(key, {}) + parts = data.get("date-parts") if isinstance(data, dict) else None + if parts and len(parts) and len(parts[0]): + maybe_year = parts[0][0] + if isinstance(maybe_year, int): + return maybe_year + return None diff --git a/build/lib/citer/formatter.py b/build/lib/citer/formatter.py new file mode 100644 index 0000000..8f2e94c --- /dev/null +++ b/build/lib/citer/formatter.py @@ -0,0 +1,90 @@ +from typing import Iterable + +from .models import Author, WorkMetadata + + +def format_apa(metadata: WorkMetadata) -> str: + authors_text = format_authors(metadata.authors) + year_text = f"({metadata.year})." if metadata.year else "(n.d.)." + title_text = _sentence_case(metadata.title).rstrip(".") + "." 
+ container_text = _format_container(metadata) + + parts = [authors_text, year_text, title_text] + if container_text: + parts.append(container_text) + + link = None + if metadata.doi: + link = f"https://doi.org/{metadata.doi}" + elif metadata.url: + link = metadata.url + if link: + parts.append(link) + + return " ".join(part.strip() for part in parts if part).strip() + + +def format_authors(authors: Iterable[Author]) -> str: + formatted = [format_author(author) for author in authors if format_author(author)] + if not formatted: + return "" + if len(formatted) == 1: + return formatted[0] + if len(formatted) == 2: + return f"{formatted[0]}, & {formatted[1]}" + return ", ".join(formatted[:-1]) + f", & {formatted[-1]}" + + +def format_author(author: Author) -> str: + given_initials = " ".join(_initial(part) for part in author.given.split() if part) + family = author.family.strip() + if family and given_initials: + return f"{family}, {given_initials}" + if family: + return family + return given_initials + + +def _initial(part: str) -> str: + clean = part.strip() + if not clean: + return "" + return f"{clean[0].upper()}." 
+ + +def _sentence_case(text: str) -> str: + stripped = text.strip() + if not stripped: + return stripped + lower = stripped[0].upper() + stripped[1:] + return lower + + +def _format_container(metadata: WorkMetadata) -> str: + if not metadata.container_title and not metadata.volume and not metadata.pages: + return "" + + pieces = [] + if metadata.container_title: + pieces.append(metadata.container_title) + volume_issue = "" + if metadata.volume: + volume_issue = metadata.volume + if metadata.issue: + volume_issue += f"({metadata.issue})" + elif metadata.issue: + volume_issue = f"({metadata.issue})" + + if volume_issue: + pieces.append(volume_issue) + + if metadata.source == "arxiv" and metadata.identifier: + pieces.append(f"arXiv:{metadata.identifier}") + + if metadata.pages: + pieces.append(metadata.pages) + + container = ", ".join(pieces) + if container and not container.endswith("."): + container += "." + return container diff --git a/build/lib/citer/identifiers.py b/build/lib/citer/identifiers.py new file mode 100644 index 0000000..913861a --- /dev/null +++ b/build/lib/citer/identifiers.py @@ -0,0 +1,50 @@ +import re +from urllib.parse import unquote + +from .exceptions import CitationError + + +ARXIV_ID = re.compile( + r"^(?P(?:\d{4}\.\d{4,5}|[a-z-]+\/\d{7}))(?:v\d+)?$", + flags=re.IGNORECASE, +) +DOI_PATTERN = re.compile(r"10\.\d{4,9}/\S+", flags=re.IGNORECASE) + + +def normalize_arxiv_identifier(raw: str) -> str: + text = raw.strip() + text = unquote(text) + + url_match = re.search(r"arxiv\.org/(?:abs|pdf)/(?P[^?#/]+)", text, re.IGNORECASE) + if url_match: + text = url_match.group("id") + + text = re.sub(r"(?i)^arxiv:", "", text) + text = re.sub(r"(?i)\.pdf$", "", text) + text = text.split("?")[0] + text = text.strip() + + match = ARXIV_ID.match(text) + if not match: + raise CitationError(f"Could not understand arXiv identifier: {raw}") + + base_id = match.group("id") + base_id = re.sub(r"(?i)v\d+$", "", base_id) + return base_id + + +def normalize_doi(raw: 
def normalize_doi(raw: str) -> str:
    """Reduce a DOI, DOI URL, or "doi:" string to a bare lowercase DOI.

    Strips doi.org / dx.doi.org prefixes, a leading "doi:" label, and any
    query string, then searches for a "10.NNNN/suffix" shaped substring.

    Raises:
        CitationError: when no DOI-shaped substring is found.
    """
    candidate = unquote(raw.strip())
    candidate = re.sub(r"(?i)^https?://(dx\.)?doi\.org/", "", candidate)
    candidate = re.sub(r"(?i)^doi:\s*", "", candidate)
    candidate = candidate.split("?")[0].strip()

    # Same pattern as the module-level DOI_PATTERN, inlined so this
    # function has no ordering dependency on the constant definition.
    found = re.search(r"10\.\d{4,9}/\S+", candidate, re.IGNORECASE)
    if found is None:
        # Local import keeps this block self-contained.
        from .exceptions import CitationError

        raise CitationError(f"Could not understand DOI: {raw}")

    # DOI names are case-insensitive; lowercase gives a canonical form.
    return found.group(0).lower()


# --- citer/models.py: shared data model ---

from dataclasses import dataclass
from typing import List, Optional


@dataclass
class Author:
    """A single work author, split into given and family name parts."""

    given: str
    family: str

    @property
    def full_name(self) -> str:
        """Human-readable name; tolerates an empty given or family part."""
        return f"{self.given} {self.family}".strip()


@dataclass
class WorkMetadata:
    """Source-agnostic citation metadata produced by the fetchers."""

    title: str
    authors: List[Author]
    year: Optional[int]
    container_title: Optional[str] = None
    volume: Optional[str] = None
    issue: Optional[str] = None
    pages: Optional[str] = None
    doi: Optional[str] = None
    url: Optional[str] = None
    source: str = ""
    identifier: Optional[str] = None
+ +## Tests + +```bash +python -m pytest +``` diff --git a/citer.egg-info/SOURCES.txt b/citer.egg-info/SOURCES.txt new file mode 100644 index 0000000..8a2f550 --- /dev/null +++ b/citer.egg-info/SOURCES.txt @@ -0,0 +1,17 @@ +README.md +pyproject.toml +citer/__init__.py +citer/cli.py +citer/exceptions.py +citer/fetchers.py +citer/formatter.py +citer/identifiers.py +citer/models.py +citer.egg-info/PKG-INFO +citer.egg-info/SOURCES.txt +citer.egg-info/dependency_links.txt +citer.egg-info/entry_points.txt +citer.egg-info/requires.txt +citer.egg-info/top_level.txt +tests/test_formatter.py +tests/test_identifiers.py \ No newline at end of file diff --git a/citer.egg-info/dependency_links.txt b/citer.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/citer.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/citer.egg-info/entry_points.txt b/citer.egg-info/entry_points.txt new file mode 100644 index 0000000..ea2117c --- /dev/null +++ b/citer.egg-info/entry_points.txt @@ -0,0 +1,2 @@ +[console_scripts] +citer = citer.cli:main diff --git a/citer.egg-info/requires.txt b/citer.egg-info/requires.txt new file mode 100644 index 0000000..0eb8cae --- /dev/null +++ b/citer.egg-info/requires.txt @@ -0,0 +1 @@ +requests>=2.31.0 diff --git a/citer.egg-info/top_level.txt b/citer.egg-info/top_level.txt new file mode 100644 index 0000000..0702aa6 --- /dev/null +++ b/citer.egg-info/top_level.txt @@ -0,0 +1 @@ +citer diff --git a/citer/__init__.py b/citer/__init__.py new file mode 100644 index 0000000..747cc0f --- /dev/null +++ b/citer/__init__.py @@ -0,0 +1,3 @@ +"""APA citation helper CLI.""" + +__all__ = ["cli", "fetchers", "formatter", "identifiers", "models"] diff --git a/citer/__pycache__/__init__.cpython-313.pyc b/citer/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..31025820ec5a6c28bb46d766a4536f82bf7c542c GIT binary patch literal 251 
zcmey&%ge<81gEE)WTpV=#~=<2FhLog1%QmH48aUV4EhYgjDd_rjOk38%vBPO0gejE znI(xOnfZAN8L2r1sYMFTKAw7-thbnxb24vnq@|W5XQUPt-{MTmFUn0UDM>B5#hsaw znpcvUmI)GP%gs+o%_;WNWV*#3AD@z+93Ky2B5a&1paa=~{gA3uPZrQ$d6 zlZe=u*pLD{cWvUqezxDUY}cOGC&oVBPDEKS5uK4!ch^I@ zbOJYS6*kf)sYeEE$c!M32s#l(H*&BOx!7HHf^Owq3eB{T`4ytXZ!3GgxMM=`#D7@v z+-H8=F#MS1xzzrg`!RB+4P${a^Yx1R11*0A*K;foY9mUTUrY(7vO#$5oSXB{Lo}#Z!^V zb%?x>Dm2@2Y&?IAd0~%Jli~^Ov$oD2sqCB=^4Yn;L0g{&PIYkAYIb}}+c7+Ys@~b| cj&mHEbUWzKfFGAq`s+}7OJA@5h>cs(KSY3g?f?J) literal 0 HcmV?d00001 diff --git a/citer/__pycache__/formatter.cpython-313.pyc b/citer/__pycache__/formatter.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..78f6ced8c1a064e5038cac5384cd0c65d2372dd0 GIT binary patch literal 4836 zcmb7HO>7&-6`uVexf1_H$(CgOP_iPMu}s>Moml=+>e>lpS&|zul-=5iuoc(RI@FSx zr4(B*Y_>sQ5jl`*n~FtSggHgd#cq3XPwk;-dL&CBWEZYspg;@Xh}s33UfMUaTuP*A z109evvomkr%zOLI_uf9W+nW)TQ<)<&1HHeI2cz(n&eI=3=Tj6!LFPP~U@+rFK{m*p zXD2w!P4Jk9XZ{KwG+i+r;e+N*bdAU6pydj4g)_m*%D6zsEy1QMTxF!0jtD{P6;oxz zMn{^0_Ued(jx_Jk@1!Hv1((nLgdEVvXu`R;gvIYiq$dm+U^VmU<@jP0YqpC~{Qi^_ z7sFy))PyCYXU0-|87uQd+$h9a4*pM@Sz_(w>RbrTz({>LTA5cEm^VDNLuh}cFS@`3 zFOYG7A4j+gOpsI9%NAoX=o5_oc5Cc6W8EGzdO_Zp4e|$JP~k4yjkypq<`n)?ONa|_ ziYdfbPv=G`WSVgr(%WsQsuSKWQp|8Fu%y0J^$;sfjb$8YgjLKJx?#4Zemz1}4K_GQ zLkZa1T$Ot9dCi8Ma9;CF+w(gGEuCPi2OX_U8S*o`pO-ac29Qsf)|maPtk1t{*{}G+ zexF}s{Hw0Tcs!;a4+O$d*&oFV0gdt2z==R<1pZHFpxTSP0I3l2pz&^mqHe>SFaT=` zUTBD!slsjhFvcNP(5EIZn(9T!3VIgM>)ci5E;9|lH>q(f$L_Kk>p!INF%idA0#&b9 zI~Z7uE=d7Vl_K&|bR~eLSX2$n%WzRD_0=d|665gfkKNFkPA*7_bUlX0v~GQ3PK=5E zlac7W7*WUk)#-LnuXaK81SL?;bK=WT_MP8^e>|B!yD_mou`!vR{4(^7DuBQboA)v4 z-TN4yO>-D3>8(^Xi&*LKHzW~jjyY9Q;*v5i&CSDsHB(%UMDTE zk+5bKV=+kyYaDoAwQO6r~YiB zKGgV#th}!hHD0}74^i1{*a1@eV89H2^^Z^`&_hpOc7L&dINv{9@QfrTOTNSRy6<(T z`_khZQ|nXdH?sYC*DDXsCC)u`_hyAJ9G^Q1?m*G~THgKIeYM~|@nAmhKAo6&*fyBG zUT8a%n0{>U$X?6Y4kzAxWN9l2_LMDYOTV`z3_P|v6K6{n=bf8(ZkF7z?H5Cz4`q9D z&O=GQ)Z|WerMgPa-Km?&n``_di`7{A&X&;m*y>0fOCHO6PTUO@JSPg)6GiLky!CX! 
zdM3x8c?y;wOm0IBl8Kd+7+ru-9mTVZ$_9|i0-n7v(T3{}dU11=`1a%#P)p!#QENWOb%SlcVYHX4EX3ey?C1*lgXDAFy|q zHs^BtPUWqqa{Q@p{^>;Si;PP0h>u&p&GEli{fRkKX3eNYSLpbhb@fg5Wg+`b% zRfy4fWrpaeD+as>d`18$1SGG54CM<r6!USpG-oWu)5&BmmHq->EvQcNh$?LPtM+(`5V{wk@!2~JOHHM#o9&BXX4s|TXrnzLkUO~0KyU$k}SZQTW1Z+8FY z;mzUfP~PTGyjilk5|h*)jXgp9K|KN5>x)4O;k{seL#nDu0L(yw*jHDo5)?fFh@c=> zCqo_$IT3gtazv7l)Ro5TjA#j|Faxyq(o>3uUt|YeDb#HUkJVFoU?oIU(BD9c;+;DP z5m;d_^fi{LY9gNxFV#~+&g3JYu{WxDf|q!ngrvF>)boM>wV7~rU8$a^=S6$a&zuG; z$!E^C-(mNcy!V&B&j#zZ^+L|l=wq##btznktJ*+N!_9!V>PygKXf$NQaDhWy&_drA z=%61yI!_TVYK%`aiTJv_vT7FbC3)pIjRKUud`+5jM;fi??kmyA@)D%-F>ygsF$pl5 zNmkWmNbA+;GM<;TW;qO5zkF4eupUYT$mrD%<@lnH*E4u?OqS*)RkP{J=JfSV)xA{W zDUzS-0grr@U{bvSf7Jn33y8t$DO&f!Z*%qz|= zsxrqg%wN%-f1v$ka|?5p$#j(wX*c&iqiy+Bff>t;XJ_+WLuCa0`<`c{Detl{oteQh zf+ib$Mw)Uf&-AB9$_Sdw$TQNE9VW(|@strXSy%{7*|mph%i78an){xAk%k`gf1UAq ACjbBd literal 0 HcmV?d00001 diff --git a/citer/__pycache__/identifiers.cpython-313.pyc b/citer/__pycache__/identifiers.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..24b9bbb4213e021ebdec82362d694573d8825524 GIT binary patch literal 2456 zcmcIlUu@Gx7(Y99{@Ha(%U~4fB9^h5(0D-0)<7FoX@yErKyfoM)lj*y+Zxh1J3FPz zsL12eKBSs7(Z@V)Z%mpr9(mlPNejxuDvu$iu?OBVb$j2P6DR4GioNV4-*@-j@4mb5 zyYKgTucyZkQ2X`ah!&s$_zh2-hURwL`v!?GfCxll7Ch@Q$siD2A~ox}N^#m}P=z zU*UriJYjkA*^l$OslFnR0?m8knkk!_o}GqJhlP>bL$Mo|HDzKb7L}AwM>eDg7H3B{ z?$67^h2dLaNkEGCH+UW))+8m&$3n;qy_AucjY8nud(91F8-gT;EfO$Y1o5V3qfn?`u6E)hDSz;bz?Uo`&n@FUk9BtC=jM@@KoJe(IyzMn-G}^SA zB~#>3Qz;(;AVnq_2h)}!+4bx$uY=A;8G-KV*zdeYcRalxpCXch_Q}pjbUQZ?Jvf9} zBfp67xr}C7-pS<6nGY9c zrYw3@Hq$E*KM+qt54J;@>bhz9@-VZ|Sj6GAJd3iSE@+B@p+Vd%L0HjORRQTU+N!=L zKsBcuLRvFb*zDUEHzK)Dtbwc!SLKXWP#5f2AajmmZ~&P_ZwZX=P*()I3|Ag2jcr{j zPCaA#wnw(ci&HOq`yTo}_f-etPv(9&|Ks@gaz)Blmst)EM> zXTL2x>mYPTNwW#vDkq}*UxX%;sObH(iL`UKEGiD&%7;^+5qUj{!k|sa71PWav8W&n zDeF>%k14v=;4{P(wZPgSq>M_&8HkHRgQ^2vsXBFiCK}}Z5PND965@1*eYnHr%wiB? 
"""Command line interface for citer."""

import argparse
import sys

from .exceptions import CitationError
from .fetchers import fetch_arxiv, fetch_doi
from .formatter import format_apa
from .identifiers import normalize_arxiv_identifier, normalize_doi


def build_parser() -> argparse.ArgumentParser:
    """Assemble the CLI parser: one subcommand per identifier source."""
    parser = argparse.ArgumentParser(
        description="Create an APA style citation from an arXiv ID or DOI."
    )
    subparsers = parser.add_subparsers(dest="source", required=True)

    # Subcommands differ only in their help strings.
    for source, help_text, arg_help in (
        ("arxiv", "Cite an arXiv identifier or URL", "arXiv ID or URL"),
        ("doi", "Cite a DOI or DOI URL", "DOI or DOI URL"),
    ):
        sub = subparsers.add_parser(source, help=help_text)
        sub.add_argument("identifier", help=arg_help)

    return parser


def main(argv=None) -> int:
    """Run the CLI; returns 0 on success, 1 on any citation error."""
    parser = build_parser()
    args = parser.parse_args(argv)

    try:
        if args.source == "arxiv":
            metadata = fetch_arxiv(normalize_arxiv_identifier(args.identifier))
        elif args.source == "doi":
            metadata = fetch_doi(normalize_doi(args.identifier))
        else:
            parser.error("Unsupported source")
            return 1
    except CitationError as exc:
        print(f"Error: {exc}", file=sys.stderr)
        return 1

    print(format_apa(metadata))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
requests.RequestException as exc: + raise CitationError(f"Failed to reach arXiv API: {exc}") from exc + + try: + root = ElementTree.fromstring(response.text) + except ElementTree.ParseError as exc: + raise CitationError("Received invalid XML from arXiv") from exc + + ns = { + "atom": "http://www.w3.org/2005/Atom", + "arxiv": "http://arxiv.org/schemas/atom", + } + entry = root.find("atom:entry", ns) + if entry is None: + raise CitationError(f"No entry found for arXiv ID {arxiv_id}") + + title = _clean(entry.findtext("atom:title", default="", namespaces=ns)) + authors = _parse_arxiv_authors(entry.findall("atom:author", ns)) + year = _parse_year(entry.findtext("atom:published", default="", namespaces=ns)) + doi = _clean(entry.findtext("arxiv:doi", default="", namespaces=ns)) or None + + url = f"https://arxiv.org/abs/{arxiv_id}" + return WorkMetadata( + title=title, + authors=authors, + year=year, + container_title="arXiv preprint", + doi=doi, + url=url, + source="arxiv", + identifier=arxiv_id, + ) + + +def fetch_doi(doi: str) -> WorkMetadata: + url = CROSSREF_WORKS + quote(doi) + try: + response = requests.get( + url, headers={"Accept": "application/json", "User-Agent": USER_AGENT}, timeout=10 + ) + response.raise_for_status() + data = response.json() + except requests.RequestException as exc: + raise CitationError(f"Failed to reach Crossref for DOI {doi}: {exc}") from exc + except ValueError as exc: + raise CitationError("Received invalid JSON from Crossref") from exc + + message = data.get("message", {}) + title = _clean(" ".join(message.get("title", [])).strip()) + authors = _parse_crossref_authors(message.get("author", [])) + year = _extract_year(message) + container_title = _clean((message.get("container-title") or [""])[0]) + volume = _clean(message.get("volume", "")).strip() or None + issue = _clean(message.get("issue", "")).strip() or None + pages = _clean(message.get("page", "")).strip() or None + url = message.get("URL") or f"https://doi.org/{doi}" + + 
return WorkMetadata( + title=title, + authors=authors, + year=year, + container_title=container_title or None, + volume=volume, + issue=issue, + pages=pages, + doi=doi, + url=url, + source="doi", + identifier=doi, + ) + + +def _clean(value: str) -> str: + return " ".join(value.split()) + + +def _parse_arxiv_authors(author_elements) -> List[Author]: + authors: List[Author] = [] + for element in author_elements: + name = element.findtext("{http://www.w3.org/2005/Atom}name", default="") + given, family = _split_author_name(name) + authors.append(Author(given=given, family=family)) + return authors + + +def _parse_crossref_authors(author_data) -> List[Author]: + authors: List[Author] = [] + for author in author_data: + if "family" in author or "given" in author: + given = author.get("given", "").strip() + family = author.get("family", "").strip() + elif "name" in author: + given, family = _split_author_name(author.get("name", "")) + else: + continue + authors.append(Author(given=given, family=family)) + return authors + + +def _split_author_name(name: str) -> tuple[str, str]: + clean_name = _clean(name) + if not clean_name: + return "", "" + if "," in clean_name: + family, given = [part.strip() for part in clean_name.split(",", 1)] + else: + parts = clean_name.split() + family = parts[-1] + given = " ".join(parts[:-1]) + return given, family + + +def _parse_year(value: str) -> Optional[int]: + value = value.strip() + if not value: + return None + try: + return _dt.datetime.fromisoformat(value).year + except ValueError: + if len(value) >= 4 and value[:4].isdigit(): + return int(value[:4]) + return None + + +def _extract_year(message: dict) -> int | None: + for key in ("published-print", "published-online", "issued"): + data = message.get(key, {}) + parts = data.get("date-parts") if isinstance(data, dict) else None + if parts and len(parts) and len(parts[0]): + maybe_year = parts[0][0] + if isinstance(maybe_year, int): + return maybe_year + return None diff --git 
a/citer/formatter.py b/citer/formatter.py new file mode 100644 index 0000000..8f2e94c --- /dev/null +++ b/citer/formatter.py @@ -0,0 +1,90 @@ +from typing import Iterable + +from .models import Author, WorkMetadata + + +def format_apa(metadata: WorkMetadata) -> str: + authors_text = format_authors(metadata.authors) + year_text = f"({metadata.year})." if metadata.year else "(n.d.)." + title_text = _sentence_case(metadata.title).rstrip(".") + "." + container_text = _format_container(metadata) + + parts = [authors_text, year_text, title_text] + if container_text: + parts.append(container_text) + + link = None + if metadata.doi: + link = f"https://doi.org/{metadata.doi}" + elif metadata.url: + link = metadata.url + if link: + parts.append(link) + + return " ".join(part.strip() for part in parts if part).strip() + + +def format_authors(authors: Iterable[Author]) -> str: + formatted = [format_author(author) for author in authors if format_author(author)] + if not formatted: + return "" + if len(formatted) == 1: + return formatted[0] + if len(formatted) == 2: + return f"{formatted[0]}, & {formatted[1]}" + return ", ".join(formatted[:-1]) + f", & {formatted[-1]}" + + +def format_author(author: Author) -> str: + given_initials = " ".join(_initial(part) for part in author.given.split() if part) + family = author.family.strip() + if family and given_initials: + return f"{family}, {given_initials}" + if family: + return family + return given_initials + + +def _initial(part: str) -> str: + clean = part.strip() + if not clean: + return "" + return f"{clean[0].upper()}." 
+ + +def _sentence_case(text: str) -> str: + stripped = text.strip() + if not stripped: + return stripped + lower = stripped[0].upper() + stripped[1:] + return lower + + +def _format_container(metadata: WorkMetadata) -> str: + if not metadata.container_title and not metadata.volume and not metadata.pages: + return "" + + pieces = [] + if metadata.container_title: + pieces.append(metadata.container_title) + volume_issue = "" + if metadata.volume: + volume_issue = metadata.volume + if metadata.issue: + volume_issue += f"({metadata.issue})" + elif metadata.issue: + volume_issue = f"({metadata.issue})" + + if volume_issue: + pieces.append(volume_issue) + + if metadata.source == "arxiv" and metadata.identifier: + pieces.append(f"arXiv:{metadata.identifier}") + + if metadata.pages: + pieces.append(metadata.pages) + + container = ", ".join(pieces) + if container and not container.endswith("."): + container += "." + return container diff --git a/citer/identifiers.py b/citer/identifiers.py new file mode 100644 index 0000000..913861a --- /dev/null +++ b/citer/identifiers.py @@ -0,0 +1,50 @@ +import re +from urllib.parse import unquote + +from .exceptions import CitationError + + +ARXIV_ID = re.compile( + r"^(?P(?:\d{4}\.\d{4,5}|[a-z-]+\/\d{7}))(?:v\d+)?$", + flags=re.IGNORECASE, +) +DOI_PATTERN = re.compile(r"10\.\d{4,9}/\S+", flags=re.IGNORECASE) + + +def normalize_arxiv_identifier(raw: str) -> str: + text = raw.strip() + text = unquote(text) + + url_match = re.search(r"arxiv\.org/(?:abs|pdf)/(?P[^?#/]+)", text, re.IGNORECASE) + if url_match: + text = url_match.group("id") + + text = re.sub(r"(?i)^arxiv:", "", text) + text = re.sub(r"(?i)\.pdf$", "", text) + text = text.split("?")[0] + text = text.strip() + + match = ARXIV_ID.match(text) + if not match: + raise CitationError(f"Could not understand arXiv identifier: {raw}") + + base_id = match.group("id") + base_id = re.sub(r"(?i)v\d+$", "", base_id) + return base_id + + +def normalize_doi(raw: str) -> str: + text = 
raw.strip() + text = unquote(text) + + text = re.sub(r"(?i)^https?://(dx\.)?doi\.org/", "", text) + text = re.sub(r"(?i)^doi:\s*", "", text) + text = text.split("?")[0] + text = text.strip() + + match = DOI_PATTERN.search(text) + if not match: + raise CitationError(f"Could not understand DOI: {raw}") + + doi = match.group(0) + return doi.lower() diff --git a/citer/models.py b/citer/models.py new file mode 100644 index 0000000..b910877 --- /dev/null +++ b/citer/models.py @@ -0,0 +1,27 @@ +from dataclasses import dataclass +from typing import List, Optional + + +@dataclass +class Author: + given: str + family: str + + @property + def full_name(self) -> str: + return f"{self.given} {self.family}".strip() + + +@dataclass +class WorkMetadata: + title: str + authors: List[Author] + year: Optional[int] + container_title: Optional[str] = None + volume: Optional[str] = None + issue: Optional[str] = None + pages: Optional[str] = None + doi: Optional[str] = None + url: Optional[str] = None + source: str = "" + identifier: Optional[str] = None diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..7dbfff9 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,20 @@ +[build-system] +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "citer" +version = "0.1.0" +description = "CLI to create APA style citations from arXiv IDs or DOIs" +readme = "README.md" +requires-python = ">=3.9" +dependencies = [ + "requests>=2.31.0", +] + +[project.scripts] +citer = "citer.cli:main" + +[tool.pytest.ini_options] +addopts = "-q" +testpaths = ["tests"] diff --git a/tests/__pycache__/test_formatter.cpython-313-pytest-8.3.5.pyc b/tests/__pycache__/test_formatter.cpython-313-pytest-8.3.5.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6d628b94775e0a1f35c817e59e56ac7a2e5c3e39 GIT binary patch literal 2969 zcmd6p&2Jk;6u@WK-t|Wuf7EFb(n{T?klK~I{zzPCA<&dkRjv}GE&-$*tc^W!R;+ie 
z*>zg`)C2zlQja|$BK6QCe@3MUX*H_E0XS3~NSZ>Vo_I6sjh!eh2=%~7elzdQn>TOX z?*7L6TB3%ZgQXneyaZQZTw9X&5nd8zAe+Jq*)vdK5g z*wvf5t2YJ-@Gc251iyVxVA-sJ^qVHEF*|RXsD-M7yhp5IO$w|S_Bc501RoN0FxL;qkji>ia+htN0A9NCA};8>eQt3r@%kexnxWL5gu5>Q81 zeJ(&w!?T?1Ee3mNSyj9Y-d0$f&`z(0O9I!VwQR7klx@lt=+M5=Yw6@`H8ox#m>ZfaVmsq-~ z*c-|hu)v{Wxk82czO+!ieC7Rqwp>!nmHCAtzR$y}P21>S*mT{FbFEkeMYTz*c!CoR z!DN1nO0MCWm_~H|qB=C(!#bg&X08tM|0 z$`0h&z%*_&vE>>Y1}6O;9I2qx)Pgdn@FOur$yG{~QeIVFQw~)*e&T{suH@$P1%;nj zRsO}{wuM*X;^MZ@KlNU&(<|j2WpPnq!}+{JqYcB=UBkAvMH+%xCgxuyb;rTPrWkCDc&cY5r()uBe2dP6tOI>8;%u-hG-V48sG`g+$e;lVmoHf`{5 z=ycuLw7X`LyVhH#y{?;eONVO4>W;pFSp=F~;`>8dpAiDY$kqoS!S_3+ZutqzG}<0O zguo|+=os73&%83Klzqhyu`sG!ao9JZ@J1`eO}mYYx`R!lZEqDJ6Wb|5eK0AqV>(<8 zG6AF7>Cq{E{y}o^4nZkgqhVrFhPbR;&H;!A=vnHFm%R8aGxbn@B<*D89?L&$J-N4& z`{Zf*x;LKjrZ0J^te4Dq$`Gxt_M+)sk!gD&F}5Ncguu3sK>QJXj$Xz_Jr0=vi(u}6&HWi@y8}`H&~CdM7b1ZrA9a|J#OrJ)0-r6tn}4>1 zV_Z+_9B9#u-yJ3)pT;-*y7OE*PUXeTq>(2Xk{w#nSOj|%0~#hIZ28MRtJ({ z;xrR*RU$G4qTlxuC@4##Aim*)xC|gZIe%3-nqk4$uMM{!lK6EwlWCU43=@o2P$}{OIIuv@?;r-4nPZrVJeuCgTzY)#N%hBzJB1rh z)1Pw85Kd!z@S)`5gu43bdHrX<#0ZJjn~!2o=~Em3G2IHB7pymh`GtQ0ZNLt)Le* zqoObA8nrt;ThY`PTlKJ^9Viq`c7JVS#R&#k5FCejl4FWxwk4rE#$`L>0S<1-s}IBy&vnU-)7#^R|xPhkt~`64$! 
z7F>l>u~VbkyC=HCb#$5iqOS{3qc^jCqrBC5tEb>p^;XY-~8ykYngazZ(kj~u*hT(NiLZAahuwxxr!?VTL_ zJ^AmS?PU`0n67rDlFEgIk{nK_)>74|Cv6HZ1G(I$P*u*HE3c>HR`kmj6U)VrQ{vV+ z98+;C9{nno9VZewq&oC>M@nx(AUORwDM48MIIehl8Tb7cI}np5%y#|`f+)^qFEJab9Q%OQ7x4U zR4da(bG58e&F%nm;r1Q9RM8E~4wX#su=}{mI{Wts@ai(r7mfh z!S0%1)$67)NvTQgqlG2xsg;6uw_H*UwE~vSkS*a8_(YR_Vc0KR@Cz9(@N8LLA{Ua1 zg`F_QE;}u+n5&wsT3Sh8HP>XQ%CzJ~y`oVWr))9m>+Dvdyl(fg)$m$8Z~tp5)m5Wn z(HP{%XILB1%kPn$!E{ZWd@7xJXf&h@u#ZnNQkN#1V(Z=#Cu>s16*;rbqFiO&CxYpr z!7UA6DDaO@x&bZ&-aCU=YT`{OqBeM?AzcOj$t_0e(#@vWy0^reHR-A=a%P)_xyrgv z1QVp=riL#T_$RmA2$uo(ox$;%IK#>qZ%8@dYZEh^)}@)I*t)ksFXdd3GutfCRn~nX zm}-OL+|=;JB0u4!bz`52GrRDD$R1#ZuZ#h+uYq~s130rTEWsFl0fceEm(%P6Bt{kx z-mxq1LJ)>C0c&Pjd17Rl^po?%0Zc{MCQE@BIR)`(%LPx7B_SS|u0{;AGNc+qdND5A zA$|(&5LIKFuN;F6Q6;dJK0NVasywVb3P6ZdTeFPDh*!mHAF+jE+fGKnLbG06h zQs!zCbD#iPm*$#c>)rwlq7+X9{5Hn9y1FNU0n%|x!xxGC#GVw~2mE(>