Harvest information about newspaper issues¶
When you search Trove's newspapers, you find articles – these articles are grouped by page, and all the pages from a particular date make up an issue. But how do you find out what issues are available? On what dates were newspapers published? This notebook shows how you can get information about issues from the Trove API.
The code below generates two datasets:
- Total number of issues per year for every newspaper – 27,615 rows with the fields:
  - `title` – newspaper title
  - `title_id` – newspaper id
  - `state` – place of publication
  - `year` – year published
  - `issues` – number of issues
- Complete list of issues for every newspaper – 2,655,664 rows with the fields:
  - `title` – newspaper title
  - `title_id` – newspaper id
  - `state` – place of publication
  - `issue_id` – issue identifier
  - `issue_date` – date of publication (YYYY-MM-DD)
These datasets are harvested regularly; you can find the latest versions here:
- Total number of issues per year for each newspaper in Trove
- Complete list of issues for every newspaper in Trove
Issue urls¶
To keep the file size down, I haven't included an `issue_url` in the issues dataset, but these are easily generated from the `issue_id`. Just add the `issue_id` to the end of `http://nla.gov.au/nla.news-issue`. For example: http://nla.gov.au/nla.news-issue495426. Note that when you follow an issue url, you actually get redirected to the url of the first page in the issue.
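If you've loaded the issues dataset into a dataframe, you could add an `issue_url` column like this – a minimal sketch using a couple of sample rows in place of the full dataset:

```python
import pandas as pd

# A couple of rows in the shape of the issues dataset
df = pd.DataFrame({"issue_id": ["495445", "495422"]})

# Build persistent issue urls by appending the issue_id to the base url
df["issue_url"] = "http://nla.gov.au/nla.news-issue" + df["issue_id"]

print(df["issue_url"].tolist())
```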
import json
import os
from datetime import timedelta
import altair as alt
import arrow
import pandas as pd
import requests_cache
from dotenv import load_dotenv
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from tqdm.auto import tqdm
# Create a session that will automatically retry on server errors
s = requests_cache.CachedSession(expire_after=timedelta(days=30))
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("http://", HTTPAdapter(max_retries=retries))
s.mount("https://", HTTPAdapter(max_retries=retries))
load_dotenv()
True
# Insert your Trove API key
API_KEY = "YOUR API KEY"
# Use api key value from environment variables if it is available
if os.getenv("TROVE_API_KEY"):
    API_KEY = os.getenv("TROVE_API_KEY")
API_URL = "https://api.trove.nla.gov.au/v3/newspaper/title/"
PARAMS = {"encoding": "json"}
HEADERS = {"X-API-KEY": API_KEY}
Total number of issues per year for every newspaper in Trove¶
To get a list of all the newspapers in Trove you make a request to the `newspaper/titles` endpoint. This provides summary information about each title, but no data about issues.

To get issue data you have to request information about each title separately, using the `newspaper/title/[title id]` endpoint. If you add `include=years` to the request, you get a list of years in which issues were published, and the total number of issues for each year. We can use this to aggregate information about the number of issues by title and year.
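The relevant part of the title response looks something like this – an abridged, illustrative sketch based on the fields the harvesting code relies on (values taken from the sample results shown further down):

```python
# Abridged sketch of a newspaper/title response with include=years
title_data = {
    "id": "166",
    "title": "Canberra Community News (ACT : 1925 - 1927)",
    "state": "ACT",
    "year": [
        {"date": "1925", "issuecount": "3"},
        {"date": "1926", "issuecount": "12"},
        {"date": "1927", "issuecount": "9"},
    ],
}

# The yearly counts can be summed to give a total for the title
total = sum(int(y["issuecount"]) for y in title_data["year"])
print(total)
```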
def get_issues_by_year():
    """
    Gets the total number of issues per year for each newspaper.

    Returns:
      * A list of dicts, each containing the number of issues available from a newspaper in a particular year
    """
    years = []
    # First we get a list of all the newspapers (and gazettes) in Trove
    response = s.get(
        "https://api.trove.nla.gov.au/v3/newspaper/titles",
        params=PARAMS,
        headers=HEADERS,
    )
    data = response.json()
    titles = data["newspaper"]
    # Then we loop through all the newspapers to retrieve issue data
    for title in tqdm(titles):
        params = PARAMS.copy()
        # This parameter adds the number of issues per year to the newspaper data
        params["include"] = "years"
        response = s.get(f'{API_URL}{title["id"]}', params=params, headers=HEADERS)
        try:
            data = response.json()
        except json.JSONDecodeError:
            print(response.url)
            print(response.text)
        else:
            # Loop through all the years, saving the totals
            for year in data["year"]:
                years.append(
                    {
                        "title": title["title"],
                        "title_id": title["id"],
                        "state": title["state"],
                        "year": year["date"],
                        "issues": int(year["issuecount"]),
                    }
                )
    return years
issue_totals = get_issues_by_year()
# Save results as a dataframe
df_totals = pd.DataFrame(issue_totals)
df_totals.head()
|   | title | title_id | state | year | issues |
|---|-------|----------|-------|------|--------|
| 0 | Canberra Community News (ACT : 1925 - 1927) | 166 | ACT | 1925 | 3 |
| 1 | Canberra Community News (ACT : 1925 - 1927) | 166 | ACT | 1926 | 12 |
| 2 | Canberra Community News (ACT : 1925 - 1927) | 166 | ACT | 1927 | 9 |
| 3 | Canberra Illustrated: A Quarterly Magazine (AC... | 165 | ACT | 1925 | 1 |
| 4 | Federal Capital Pioneer (Canberra, ACT : 1924 ... | 69 | ACT | 1924 | 1 |
How many issues are there?
df_totals["issues"].sum()
np.int64(2739444)
df_totals.shape
(29381, 5)
# Save as a CSV file
df_totals.to_csv(
    f'newspaper_issues_totals_by_year_{arrow.now().format("YYYYMMDD")}.csv', index=False
)
Display the total number of issues per year¶
By grouping the number of issues by year, we can see how the number of issues in Trove changes over time. It's interesting to compare this to the number of articles over time.
# Group by year and calculate sum of totals
df_years = df_totals.groupby(by="year").sum().reset_index()
# Create a chart
alt.Chart(df_years).mark_bar(size=2).encode(
    x=alt.X("year:Q", axis=alt.Axis(format="c")),
    y="issues:Q",
    tooltip=["year:O", "issues:Q"],
).properties(width=800)
Harvest a complete list of issues¶
We've found out how many issues were published, but not when they were published. To get a complete list of issue dates and identifiers we have to add another parameter to our title API request. The `range` parameter sets a date range. If we add it to our request, the API will return information about all the issues within that date range.

How do we set the `range`? The summary information for each title includes `startDate` and `endDate` fields. We could simply set the `range` using these, however this could return a huge amount of data. It's best to be conservative, requesting the issue data in manageable chunks. The code below iterates over the complete date range for each title, requesting a year's worth of issues at a time. Note that the `range` parameter expects a date range in the format `YYYYMMDD-YYYYMMDD`.
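A year's worth of range can be built with arrow, as in the harvesting code below – a small sketch of the formatting involved:

```python
import arrow

# Start at the beginning of a year and end at 31 December
start_date = arrow.get("1925-01-01")
range_end = start_date.replace(month=12, day=31)

# The range parameter expects YYYYMMDD-YYYYMMDD
date_range = f'{start_date.format("YYYYMMDD")}-{range_end.format("YYYYMMDD")}'
print(date_range)
```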
It turned out that some titles don't have start and end dates, and some of the start and end dates are wrong. I've found ways to work around these. See below for more information.
# These are newspapers where the date ranges are off by more than a year
# In these cases we'll harvest all the issues in one hit, rather than year by year
dodgy_dates = ["1486", "1618", "586"]
def get_title_summary(title_id):
    """
    Get the details of a single newspaper title.
    """
    response = s.get(f"{API_URL}{title_id}", params=PARAMS, headers=HEADERS)
    data = response.json()
    return data
def get_issues_in_range(title_id, start_date, end_date):
    """
    Get a list of issues available from a particular newspaper within the given date range.
    """
    issues = []
    params = PARAMS.copy()
    params["include"] = "years"
    params["range"] = f'{start_date.format("YYYYMMDD")}-{end_date.format("YYYYMMDD")}'
    response = s.get(f"{API_URL}{title_id}", params=params, headers=HEADERS)
    try:
        data = response.json()
    except json.JSONDecodeError:
        print(response.url)
        print(response.text)
    else:
        for year in data["year"]:
            if "issue" in year:
                for issue in year["issue"]:
                    issues.append(
                        {
                            "title_id": title_id,
                            "issue_id": issue["id"],
                            "issue_date": issue["date"],
                        }
                    )
    return issues
def get_issues_full_range(title_id):
    """
    In most cases we set date ranges to get issue data in friendly chunks. But sometimes
    the date ranges are missing or wrong. In these cases, we ask for everything at once,
    by setting the range to the limits of Trove.
    """
    start_date = arrow.get("1803-01-01")
    range_end = arrow.now()
    issues = get_issues_in_range(title_id, start_date, range_end)
    return issues
def get_issues_from_title(title_id):
    """
    Get a list of all the issues available for a particular newspaper.

    Params:
      * title_id - a newspaper identifier

    Returns:
      * A list containing details of available issues
    """
    issues = []
    title_summary = get_title_summary(title_id)
    # Date range is off by more than a year, so get everything in one hit
    if title_id in dodgy_dates:
        issues += get_issues_full_range(title_id)
    else:
        try:
            # The date ranges are not always reliable, so to make sure we get everything
            # we'll set the range to the beginning and end of the given year
            start_date = arrow.get(title_summary["startDate"]).replace(day=1, month=1)
            end_date = arrow.get(title_summary["endDate"]).replace(day=31, month=12)
        except KeyError:
            # Some records have no start and end dates at all
            # In this case set the range to the full range of Trove's newspapers
            issues += get_issues_full_range(title_id)
        else:
            # If the date range is available, loop through it by year
            while start_date <= end_date:
                range_end = start_date.replace(month=12, day=31)
                issues += get_issues_in_range(title_id, start_date, range_end)
                start_date = start_date.shift(years=+1).replace(month=1, day=1)
    return issues
def get_all_issues():
    issues = []
    response = s.get(
        "https://api.trove.nla.gov.au/v3/newspaper/titles",
        params=PARAMS,
        headers=HEADERS,
    )
    data = response.json()
    titles = data["newspaper"]
    for title in tqdm(titles):
        title_issues = get_issues_from_title(title["id"])
        issues += [
            dict(i, title=title["title"], state=title["state"]) for i in title_issues
        ]
    return issues
issues = get_all_issues()
len(issues)
2739444
df_issues = pd.DataFrame(issues)
df_issues.head()
|   | title_id | issue_id | issue_date | title | state |
|---|----------|----------|------------|-------|-------|
| 0 | 166 | 495445 | 1925-10-14 | Canberra Community News (ACT : 1925 - 1927) | ACT |
| 1 | 166 | 495422 | 1925-11-11 | Canberra Community News (ACT : 1925 - 1927) | ACT |
| 2 | 166 | 495423 | 1925-12-11 | Canberra Community News (ACT : 1925 - 1927) | ACT |
| 3 | 166 | 495424 | 1926-01-11 | Canberra Community News (ACT : 1925 - 1927) | ACT |
| 4 | 166 | 495425 | 1926-02-11 | Canberra Community News (ACT : 1925 - 1927) | ACT |
df_issues.to_csv(f'newspaper_issues_{arrow.now().format("YYYYMMDD")}.csv', index=False)
df_issues.to_parquet(
    f'newspaper_issues_{arrow.now().format("YYYYMMDD")}.parquet', index=False
)
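Once saved, the harvested data is easy to explore – for example, counting issues per state. This is a sketch using a few sample rows from the table above in place of the full dataset; the commented-out filename is just an example:

```python
import pandas as pd

# In practice you'd load a harvested snapshot, e.g.:
# df_issues = pd.read_parquet("newspaper_issues_20240101.parquet")

# A few sample rows in the shape of the harvested dataset
df_issues = pd.DataFrame([
    {"title_id": "166", "issue_id": "495445", "issue_date": "1925-10-14", "state": "ACT"},
    {"title_id": "166", "issue_id": "495422", "issue_date": "1925-11-11", "state": "ACT"},
    {"title_id": "166", "issue_id": "495423", "issue_date": "1925-12-11", "state": "ACT"},
])

# Number of issues per state
state_counts = df_issues["state"].value_counts()
print(state_counts["ACT"])
```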
Check to see what's missing¶
I ran the code below a few times in order to identify problems with the harvest. It helped me track down newspapers that had errors in the date ranges. Most of the errors were small and I could pick up any missing issues by expanding the date range to cover a whole year. But in some cases, the date ranges missed multiple years. To get the issues in these missing years, I created a `dodgy_dates` list. Any newspapers in this list are processed differently – the given date range is ignored, and instead the range is set to cover the period from 1803 to the present! Titles that are missing start and end dates are treated the same way. Once these fixes were included, the only missing issues left were from the Noosa News – requesting issues from this newspaper originally caused an error, but this bug has now been fixed by Trove so no issues are missing!
# Compare the total number of issues reported by the API with the number actually harvested
# This helps us identify cases where the harvest has failed for some reason.
missing = 0
for title, years in df_totals.groupby(by=["title_id", "title"]):
    num_issues = df_issues.loc[df_issues["title_id"] == title[0]].shape[0]
    if years["issues"].sum() != num_issues:
        print(title[0], title[1])
        print(f'Year totals: {years["issues"].sum()}')
        print(f"Issues harvested: {num_issues}")
        missing += years["issues"].sum() - num_issues
Created by Tim Sherratt for the GLAM Workbench. Support this project by becoming a GitHub sponsor.