Harvesting Australian Women's Weekly covers¶
(or all the front pages of any digitised newspaper)¶
Somewhat confusingly, the Australian Women's Weekly is included with Trove's digitised newspapers rather than with the rest of the magazines. The GLAM Workbench's journals section already has notebooks to help harvest all of a journal's covers as images, so I thought I should do the same for the Weekly.
Just change the TITLE_ID, START_YEAR, END_YEAR, and PREFIX to harvest all the front pages of any digitised newspaper.
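For example, to harvest the front pages of a different newspaper, your settings might look like this (the title id below is hypothetical; you can look up real title ids via the Trove API's newspaper titles endpoint):

TITLE_ID = "1234"  # hypothetical -- look up the real title id in Trove
START_YEAR = 1900
END_YEAR = 1911  # years up to, but not including, END_YEAR are harvested
PREFIX = "my-paper"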
Harvest summary¶
- The list of issues harvested is saved as a CSV (data/aww-issues.csv, created below).
- 2,566 images were downloaded.
- For easy browsing, I've compiled the images into a set of PDF files, one for each decade, available from Dropbox (a sketch of how you might build similar PDFs yourself follows below).
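The decade PDFs linked above were compiled separately, but if you want to build similar PDFs from your own harvest, here's a minimal sketch using Pillow (an assumption on my part; any imaging library that writes PDFs would do):

from pathlib import Path

from PIL import Image

# Gather one decade's covers (here the 1930s) from the default output directory
covers = sorted(Path("data", "aww").glob("aww-193*.jpg"))
# Convert to RGB and save the lot as a single multi-page PDF
pages = [Image.open(c).convert("RGB") for c in covers]
pages[0].save("aww-1930s.pdf", save_all=True, append_images=pages[1:])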
Import what we need¶
In [ ]:
import os
import re
import shutil
import time
from pathlib import Path
import pandas as pd
import requests_cache
from dotenv import load_dotenv
from IPython.display import FileLink, display
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from tqdm.auto import tqdm
# Create a session that caches responses locally, so nothing is requested twice
s = requests_cache.CachedSession("front_pages")

# Retry failed requests to ride out transient server errors
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))

# Load environment variables from a .env file (if there is one)
load_dotenv()
Set some options¶
Modify the values below as required.
In [ ]:
# Insert your Trove API key
API_KEY = "YOUR API KEY"
# Use the API key value from environment variables if it is available
if os.getenv("TROVE_API_KEY"):
API_KEY = os.getenv("TROVE_API_KEY")
# The id of the newspaper you want to harvest
TITLE_ID = "112" # 112 is the AWW
# Range of years to harvest (from START_YEAR up to, but not including, END_YEAR)
START_YEAR = 1933
END_YEAR = 1983
# A prefix to use in file names; if None, the TITLE_ID will be used
PREFIX = "aww"
Define some functions¶
In [ ]:
TITLE_URL = f"https://api.trove.nla.gov.au/v3/newspaper/title/{TITLE_ID}"
def get_current_year(years, year):
"""
Get data for the current year from the dictionary of years.
"""
for year_data in years:
if year_data["date"] == str(year):
return year_data
def get_issues():
"""
Get all the issue details by looping through the range of years.
Returns a list of issues.
"""
params = {"encoding": "json", "include": "years"}
headers = {"X-API-KEY": API_KEY}
issues = []
for year in tqdm(range(START_YEAR, END_YEAR), desc="Issues"):
# Setting 'range' tells the API to give us a list of issue dates & urls within that date range
date_range = f"{year}0101-{year}1231"
params["range"] = date_range
# Get the data
response = s.get(TITLE_URL, params=params, headers=headers)
data = response.json()
        # Extract the details for the current year
        year_data = get_current_year(data["year"], year)
        # Save issue details (some years may have no issues at all)
        if year_data:
            for issue in year_data.get("issue", []):
                issues.append(issue)
return issues
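# Each issue record returned by get_issues() is a small dictionary.
# The values below are illustrative (not a real issue), but the code in
# this notebook relies on the "id", "date", and "url" fields:
# {
#     "id": "495496",
#     "date": "1933-06-10",
#     "url": "https://nla.gov.au/nla.news-issue495496"
# }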
def get_file_prefix():
"""
Set the prefix to be used in filenames and data directory.
Defaults to title id if prefix is not set
"""
if PREFIX:
file_prefix = PREFIX
else:
file_prefix = TITLE_ID
return file_prefix
def create_output_dir(file_prefix):
"""
Create output directory.
"""
dir_path = Path("data", file_prefix)
dir_path.mkdir(parents=True, exist_ok=True)
return dir_path
def download_page(page_id, size, file_path):
"""
Download page image using the supplied id.
Size range is 1 to 7 (7 being the highest res)
"""
    # Format the page url using the page id
page_url = (
f"http://trove.nla.gov.au/ndp/imageservice/nla.news-page{page_id}/level{size}"
)
# Download the image
response = s.get(page_url)
file_path.write_bytes(response.content)
    # Pause briefly between downloads to be polite to the image server
    time.sleep(0.5)
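# For example, to download a single page image at the highest resolution
# (the page id here is hypothetical, for illustration only):
# download_page("12345678", 7, Path("data", "sample-page.jpg"))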
def harvest_covers(size=5, sample_size=None):
"""
Get a list of issues of the title.
Loop through the issues downloading each front page/cover.
Return issue metadata.
"""
    # Get a list of issues
    issues = get_issues()
    # Set up the output directory and filename prefix once, before the loop
    file_prefix = get_file_prefix()
    dir_path = create_output_dir(file_prefix)
    # Loop through the issues
    for issue in tqdm(issues[:sample_size], desc="Pages"):
        # Request the issue url
        response = s.get(issue["url"])
        # The issue url will be redirected to a page url,
        # so extract the page id from the redirected url
        page_id = re.search(r"(\d+)$", response.url).group(1)
        # Save page id to metadata
        issue["page_id"] = page_id
        # Build the image file path from the prefix, issue date, and page id
        file_path = Path(
            dir_path,
            f'{file_prefix}-{issue["date"].replace("-", "")}-page{page_id}.jpg',
        )
# If the image hasn't already been downloaded, then download it!
if not file_path.exists():
download_page(page_id, size, file_path)
# Save the image name to the metadata
issue["image_name"] = file_path.name
return issues
Run the harvest!¶
In [ ]:
issues = harvest_covers()
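By default, harvest_covers() downloads every cover at resolution level 5. While you're testing, you can limit the number of covers and change the image size, for example:

issues = harvest_covers(size=7, sample_size=10)  # first ten covers, highest resolution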
Save the metadata¶
In [ ]:
# Convert the issue metadata into a DataFrame
df = pd.DataFrame(issues)
# Rename the id column so it's clear these are issue ids
df = df.rename(columns={"id": "issue_id"})
df.head()
In [ ]:
file_prefix = get_file_prefix()
df.to_csv(f"data/{file_prefix}-issues.csv", index=False)
display(FileLink(f"data/{file_prefix}-issues.csv"))
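If you come back to the harvest later, you can reload the metadata from the CSV; a minimal example, assuming the default aww prefix:

import pandas as pd

# Reload the harvested metadata and count the issues harvested per year
df = pd.read_csv("data/aww-issues.csv", parse_dates=["date"])
df["date"].dt.year.value_counts().sort_index()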
In [ ]:
# FOR TESTING ONLY -- IGNORE THIS CELL
if os.getenv("GW_STATUS") == "dev":
PREFIX = "test"
issues = harvest_covers(sample_size=5)
assert len(list(Path("data", "test").glob("*.jpg"))) == 5
shutil.rmtree(Path("data", "test"))
Created by Tim Sherratt for the GLAM Workbench.
Support this project by becoming a GitHub sponsor.