from datetime import UTC, datetime, timedelta
from pathlib import Path
from urllib.parse import urlencode
import numpy as np
import requests
from repo_stats.utilities import update_cache
class ADSCitations:
    """
    Get, process and aggregate citation data from the NASA ADS database
    for a given set of papers.
    """

    def __init__(self, token, cache_dir) -> None:
        """
        Arguments
        ---------
        token : str
            Authorization token for ADS queries
        cache_dir : str
            Path to a directory that will be populated with caches of
            citation data (one cache file per paper bibcode)
        """
        self.token = token
        self.cache_dir = cache_dir
[docs]
def get_citations(self, bib, metric):
"""
Get citation data for a paper with the identifier 'bib' by querying the
ADS API.
Arguments
---------
bib : str
Bibcode identifier of the paper being cited, e.g., "2013A&A...558A..33A"
metric : str
Metrics to return for each citation to the paper, e.g. "bibcode, pubdate, pub, author, title"
Returns
-------
all_cites : list of dict
For each citation to the paper 'bib', a dictionary of 'metric' data
"""
cache_file = f"{self.cache_dir}/{bib}.txt"
if not Path(cache_file).exists():
Path(cache_file).open("w").close()
with Path(cache_file).open() as f:
old_cites = f.readlines()
print(f" {len(old_cites)} citations found in ADS cache at {cache_file}")
if old_cites is None:
end, start = 1, 0
else:
end, start = len(old_cites) + 1, len(old_cites)
new_cites = []
while end > start:
encoded_query = urlencode(
{
"q": f"citations({bib})",
"fl": metric,
"rows": 100,
"start": start,
}
)
response = requests.get(
f"https://api.adsabs.harvard.edu/v1/search/query?{encoded_query}",
headers={
"Authorization": "Bearer " + self.token,
"Content-type": "application/json",
},
timeout=180,
)
if response.status_code == 200:
result = response.json()["response"]
new_cites.extend(result["docs"])
end, start = result["numFound"], result["start"] + len(result["docs"])
else:
msg = f"Query failed -- return code {response.status_code}"
raise Exception(msg)
return update_cache(cache_file, old_cites, new_cites)
[docs]
def process_citations(self, citations):
"""
Process (obtain statistics for) citation data in 'citations'.
Arguments
---------
citations : list of dict
Dictionary of data for each citation to the reference paper
Returns
-------
stats : dict
Citation statistics:
- 'cite_all': total number of citations
- 'cite_year': citations in current year
- 'cite_month': citations in previous month
- 'cite_per_year': citations per year
- 'cite_bibcodes': bibcodes of all citations
"""
# [year, month] of each citation
dates = [x["pubdate"][:7].split("-") for x in citations]
dates = [[int(x[0]), int(x[1])] for x in dates]
time_utc = datetime.now(UTC)
cite_total = len(citations)
cite_this_year = [x[0] for x in dates].count(time_utc.year)
last_month = time_utc.replace(day=1) - timedelta(days=1)
cite_last_month = dates.count([last_month.year, last_month.month])
cite_year, cite_per_year = np.unique([x[0] for x in dates], return_counts=True)
cite_bibcodes = [x["bibcode"] for x in citations]
return {
"cite_all": cite_total,
"cite_year": cite_this_year,
"cite_month": cite_last_month,
"cite_per_year": [cite_year, cite_per_year],
"cite_bibcodes": cite_bibcodes,
}
[docs]
def aggregate_citations(self, bibcode, metric="bibcode, pubdate, pub, author, title"):
"""
Get, process and aggregate citation data in 'metric' for all papers in
'bibcode'.
Arguments
---------
bibcode : str or list of str
Bibcode identifier(s) of the paper(s) being cited, e.g., "2013A&A...558A..33A"
metric : str, default="bibcode, pubdate, pub, author, title"
Metrics to return for each citation
Returns
-------
all_stats : dict
Individual and aggregated citation statistics across all papers in 'bibcode'
"""
all_citations, all_stats = [], {}
for _ii, bb in enumerate(bibcode):
print(f"\nCollecting and processing citations for paper {_ii + 1} of {len(bibcode)}: {bb}")
citations = self.get_citations(bb, metric)
all_citations.extend(citations)
stats = self.process_citations(citations)
all_stats[bb] = stats
print("\nAggregating citations for all papers")
# remove duplicates of papers that cite multiple references in 'bibcode'
all_citations_unique = [x for i, x in enumerate(all_citations) if x not in all_citations[i + 1 :]]
all_stats["aggregate"] = self.process_citations(all_citations_unique)
print(
f" {len(all_citations_unique)} unique of {len(all_citations)} total citations - returning only unique citations"
)
return all_stats