Harvest the issues of a newspaper as PDFs
This notebook harvests issues of a newspaper as PDFs – one PDF per issue. If the newspaper has a long print run, this can consume a lot of time and disk space, so you might want to limit your harvest by date range.
The downloaded PDFs are saved in the data/issues folder. The PDF file names have the following structure:
[newspaper identifier]-[issue date as YYYYMMDD]-[issue identifier].pdf
For example:
903-19320528-1791051.pdf
- 903 – the newspaper identifier, in this case the Glen Innes Examiner
- 19320528 – the issue date, 28 May 1932
- 1791051 – the issue identifier; to view the issue in Trove, just add this to http://nla.gov.au/nla.news-issue, eg http://nla.gov.au/nla.news-issue1791051
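Because the file names follow this pattern, you can recover the basic metadata of a downloaded issue later without going back to the API. Here's a minimal sketch, using the example file name from above:
# Split an issue file name back into its parts
filename = "903-19320528-1791051.pdf"
title_id, issue_date, issue_id = filename[:-4].split("-")
print(title_id, issue_date, issue_id)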
Set up what we need
Make sure you paste in your Trove API key where indicated.
import json
import os
import time
from pathlib import Path
import arrow
import pandas as pd
import requests
from dotenv import load_dotenv
from requests.adapters import HTTPAdapter
from requests.exceptions import HTTPError
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm
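# Create a requests session that automatically retries requests that fail with server errors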
s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
s.mount("http://", HTTPAdapter(max_retries=retries))
s.mount("https://", HTTPAdapter(max_retries=retries))
load_dotenv()
# Insert your Trove API key
API_KEY = "YOUR API KEY"
# Use api key value from environment variables if it is available
if os.getenv("TROVE_API_KEY"):
API_KEY = os.getenv("TROVE_API_KEY")
API_URL = "https://api.trove.nla.gov.au/v3/newspaper/title/"
PARAMS = {"encoding": "json"}
HEADERS = {"X-API-KEY": API_KEY}
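Before going any further, you can check that your API key works by requesting the details of an example title. This is just a quick sanity check – a 200 status code means your key was accepted. (Newspaper id 903 is the Glen Innes Examiner mentioned above.)
# Make a test request to check the API key is accepted
response = s.get(f"{API_URL}903", params=PARAMS, headers=HEADERS)
print(response.status_code)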
Get information about available issues
Before we start downloading huge numbers of PDFs, let's have a look at how many issues are available for the newspaper we're interested in. This code comes from harvest_newspaper_issues.ipynb.
# THIS CODE COMES FROM harvest_newspaper_issues.ipynb
# These are newspapers where the date ranges are off by more than a year
# In these cases we'll harvest all the issues in one hit, rather than year by year
dodgy_dates = ["1486", "1618", "586"]
def get_title_summary(title_id):
"""
Get the details of a single newspaper title.
"""
response = s.get(f"{API_URL}{title_id}", params=PARAMS, headers=HEADERS)
data = response.json()
return data
def get_issues_in_range(title_id, start_date, end_date):
"""
Get a list of issues available from a particular newspaper within the given date range.
"""
issues = []
params = PARAMS.copy()
params["include"] = "years"
params["range"] = f'{start_date.format("YYYYMMDD")}-{end_date.format("YYYYMMDD")}'
response = s.get(f"{API_URL}{title_id}", params=params, headers=HEADERS)
try:
data = response.json()
except json.JSONDecodeError:
print(response.url)
print(response.text)
else:
for year in data["year"]:
if "issue" in year:
for issue in year["issue"]:
issues.append(
{
"title_id": title_id,
"issue_id": issue["id"],
"issue_date": issue["date"],
}
)
return issues
def get_issues_full_range(title_id):
"""
In most cases we set date ranges to get issue data in friendly chunks. But sometimes the date ranges are missing or wrong.
In these cases, we ask for everything at once, by setting the range to the limits of Trove.
"""
start_date = arrow.get("1803-01-01")
range_end = arrow.now()
issues = get_issues_in_range(title_id, start_date, range_end)
return issues
def get_issues_from_title(title_id):
"""
Get a list of all the issues available for a particular newspaper.
Params:
* title_id - a newspaper identifier
Returns:
* A list containing details of available issues
"""
issues = []
title_summary = get_title_summary(title_id)
    # Date range is off by more than a year, so get everything in one hit
    # (convert the id to a string so integer ids match the list of dodgy titles)
    if str(title_id) in dodgy_dates:
issues += get_issues_full_range(title_id)
else:
try:
# The date ranges are not always reliable, so to make sure we get everything
# we'll set the range to the beginning and end of the given year
start_date = arrow.get(title_summary["startDate"]).replace(day=1, month=1)
end_date = arrow.get(title_summary["endDate"]).replace(day=31, month=12)
except KeyError:
# Some records have no start and end dates at all
# In this case set the range to the full range of Trove's newspapers
issues += get_issues_full_range(title_id)
else:
# If the date range is available, loop through it by year
while start_date <= end_date:
range_end = start_date.replace(month=12, day=31)
issues += get_issues_in_range(title_id, start_date, range_end)
start_date = start_date.shift(years=+1).replace(month=1, day=1)
return issues
Harvest the issue data.
# Set the id of the newspaper you want to harvest from
# You can get the newspaper id from the title details page in Trove
trove_newspaper_id = 1646
# Harvest the issue data
issues = get_issues_from_title(trove_newspaper_id)
Convert to a dataframe for analysis.
df = pd.DataFrame(issues)
df.head()
How many issues are available?
df.shape[0]
What is the date range of the issues?
df["issue_date"].min()
df["issue_date"].max()
Harvest the issues as PDFs
Now that we have the issue data, we can use it to download the PDFs.
# THIS CODE IS A SLIGHTLY MODIFIED VERSION OF WHAT'S IN THE TROVE NEWSPAPER HARVESTER
def ping_pdf(ping_url):
"""
Check to see if a PDF is ready for download.
If a 200 status code is received, return True.
"""
ready = False
try:
response = s.get(ping_url, timeout=30)
response.raise_for_status()
except HTTPError:
if response.status_code == 423:
ready = False
else:
raise
else:
ready = True
return ready
def get_pdf_url(issue_id):
"""
    Get the url of the PDF version of an issue.
These can take a while to generate, so we need to ping the server to see if it's ready before we download.
"""
pdf_url = None
# Ask for the PDF to be created
prep_url = (
f"https://trove.nla.gov.au/newspaper/rendition/nla.news-issue{issue_id}/prep"
)
response = s.get(prep_url)
# Get the hash
prep_id = response.text
# Url to check if the PDF is ready
ping_url = f"https://trove.nla.gov.au/newspaper/rendition/nla.news-issue{issue_id}.ping?followup={prep_id}"
tries = 0
ready = False
time.sleep(2) # Give some time to generate pdf
# Are you ready yet?
while ready is False and tries < 5:
ready = ping_pdf(ping_url)
if not ready:
tries += 1
time.sleep(2)
# Download if ready
if ready:
pdf_url = f"https://trove.nla.gov.au/newspaper/rendition/nla.news-issue{issue_id}.pdf?followup={prep_id}"
return pdf_url
def harvest_pdfs(issues, start_date=None, end_date=None):
"""
Download all issue pdfs within the given date range.
"""
output_path = Path("data", "issues")
output_path.mkdir(parents=True, exist_ok=True)
df = pd.DataFrame(issues)
if start_date and end_date:
df_range = df.loc[
(df["issue_date"] >= start_date) & (df["issue_date"] <= end_date)
]
elif start_date:
df_range = df.loc[(df["issue_date"] >= start_date)]
    elif end_date:
        df_range = df.loc[(df["issue_date"] <= end_date)]
else:
df_range = df
    for issue in tqdm(df_range.itertuples(), total=df_range.shape[0]):
        pdf_url = get_pdf_url(issue.issue_id)
        # Skip this issue if the PDF never became ready for download
        if pdf_url:
            response = s.get(pdf_url)
            Path(
                output_path,
                f'{issue.title_id}-{issue.issue_date.replace("-", "")}-{issue.issue_id}.pdf',
            ).write_bytes(response.content)
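If you only want a single issue, rather than a date range, you can call get_pdf_url directly. A quick sketch, using the example issue identifier from the introduction:
# Download the PDF of a single issue
Path("data", "issues").mkdir(parents=True, exist_ok=True)
pdf_url = get_pdf_url("1791051")
if pdf_url:
    pdf_path = Path("data", "issues", "903-19320528-1791051.pdf")
    pdf_path.write_bytes(s.get(pdf_url).content)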
In the cell below you can set a date range for your harvest. Adjust the start and end dates as required. If you want to harvest ALL the issues, set the start and end dates to None.
# Set start and end dates - YYYY-MM-DD, eg:
# start_date = '1932-05-01'
# Adjust these to suit your case, set to None to get everything
start_date = None
end_date = None
# Start harvesting the PDFs!
harvest_pdfs(issues, start_date=start_date, end_date=end_date)
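Once the harvest has finished, you can check what was saved. A quick sketch that counts the downloaded files and reports their combined size:
# Summarise the harvested PDFs
pdfs = list(Path("data", "issues").glob("*.pdf"))
total_mb = sum(pdf.stat().st_size for pdf in pdfs) / (1024 * 1024)
print(f"{len(pdfs)} PDFs, {total_mb:.1f} MB in total")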
Created by Tim Sherratt for the GLAM Workbench.
Support this project by becoming a GitHub sponsor.