Get details of periodicals from the /magazine/titles API endpoint¶
Version 3 of the Trove API introduced a new /magazine/titles endpoint to deliver information about digitised periodicals (other than newspapers). Previously you had to construct a search for periodicals and then scrape the Trove website to get a list of issues. However, the API in its current form has a number of problems:
- many duplicate records
- includes many Parliamentary Papers
- lists of issues can be incomplete
This notebook uses the /magazine/titles endpoint to get details of titles and issues. It then tries to fix these problems by removing duplicates and Parliamentary Papers, and checking the lists of issues against those scraped from the Trove website.
# Let's import the libraries we need.
import json
import os
import re
import time
from datetime import timedelta
from pathlib import Path
import arrow
import pandas as pd
import requests_cache
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm
s = requests_cache.CachedSession(expire_after=timedelta(days=30))
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))
load_dotenv()
True
# Insert your Trove API key
API_KEY = "YOUR API KEY"
# Use api key value from environment variables if it is available
if os.getenv("TROVE_API_KEY"):
    API_KEY = os.getenv("TROVE_API_KEY")
Harvest a list of periodical titles from the /magazine/titles endpoint¶
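Before harvesting everything, it can help to make a single exploratory request to see what the endpoint returns. This is just a sketch using the session and API key set up above; it reports the total number of titles and the fields included in the first record.
# A quick exploratory request -- just a sketch to show the shape of the response
sample = s.get(
    "https://api.trove.nla.gov.au/v3/magazine/titles",
    params={"encoding": "json", "limit": 1},
    headers={"X-API-KEY": API_KEY},
).json()
# The response includes a total count and a list of title records under 'magazine'
print(sample["total"])
print(list(sample["magazine"][0].keys()))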
def get_total(params):
    """
    Retrieve the total number of results.
    """
    response = s.get(
        "https://api.trove.nla.gov.au/v3/magazine/titles",
        params=params,
        headers={"X-API-KEY": API_KEY},
    )
    data = response.json()
    return data["total"]
def get_titles():
    """
    Save all records from the /magazine/titles endpoint.
    """
    # Need to use limit and offset for pagination
    params = {"encoding": "json", "limit": 100, "offset": 0}
    headers = {"X-API-KEY": API_KEY}
    titles = []
    more = True
    total = get_total(params)
    with tqdm(total=total) as pbar:
        while more:
            response = s.get(
                "https://api.trove.nla.gov.au/v3/magazine/titles",
                params=params,
                headers=headers,
            )
            data = response.json()
            if "magazine" in data:
                titles += data["magazine"]
                params["offset"] += 100
                pbar.update(len(data["magazine"]))
            else:
                more = False
    return titles
titles = get_titles()
Convert the results to a dataframe¶
We'll convert the harvested results to a dataframe for further processing and analysis.
df_titles = pd.DataFrame(titles)
df_titles.head()
 | id | title | publisher | place | troveUrl | startDate | endDate | issn |
---|---|---|---|---|---|---|---|---|
0 | nla.obj-2526944948 | ... Annual report of the Canned Fruits Control... | Printed and published for the Government of th... | [Australia] | https://nla.gov.au/nla.obj-2526944948 | 1927-01-01 | 1937-06-30 | NaN |
1 | nla.obj-244631375 | ... musical cabinet, no. 1-37 by W.H. Glen & C... | W.H. Glen & Co. | NaN | https://nla.gov.au/nla.obj-244631375 | NaN | NaN | NaN |
2 | nla.obj-243252799 | ... musical magazine, No. 1-89 by Nicholson, [... | Nicholson & Co.; Nicholson & Ascherberg; some ... | NaN | https://nla.gov.au/nla.obj-243252799 | NaN | NaN | NaN |
3 | nla.obj-1179844258 | ... Review | Australian Govt. Pub. Service | [Australia] | https://nla.gov.au/nla.obj-1179844258 | 1974-01-01 | 1994-06-30 | 1034-585X |
4 | nla.obj-8423556 | "Coo-ee!" : the journal of the Bishops Knoll H... | Partridge & Love Ltd. | NaN | https://nla.gov.au/nla.obj-8423556 | 1916-01-01 | 1917-10-20 | NaN |
How many titles do we have?
df_titles.shape[0]
2504
Dealing with duplicates¶
Let's check to see if there are any duplicate title records.
df_titles.shape[0] - df_titles["id"].nunique()
311
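Before dropping them, we can peek at a few of the duplicated records to check that they really are repeats of the same identifier. A minimal check using pandas:
# Optional check: view some records that share the same id
dupes = df_titles[df_titles.duplicated("id", keep=False)].sort_values("id")
dupes[["id", "title"]].head(10)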
Ok, so let's get rid of the duplicates.
df_titles.drop_duplicates(["id"], inplace=True)
How many title records do we have now?
df_titles.shape[0]
2193
Removing Parliamentary Papers¶
If you poke around in the titles data you'll see that it includes a lot of Commonwealth Parliamentary Papers. I think it's better to treat the periodicals and Parliamentary Papers separately, so we'll try and remove the Parliamentary Papers from the dataset.
Elsewhere, I've created a dataset of Parliamentary Papers. We'll use this to remove them from the periodicals data. First we'll load the Parliamentary Papers harvest into a dataframe.
dfpp = pd.read_csv(
    "https://raw.githubusercontent.com/GLAM-Workbench/trove-parliamentary-papers-data/main/trove-parliamentary-papers.csv",
    keep_default_na=False,
)
Next, we'll extract a list of parent identifiers from the Parliamentary Papers data. The parent records should correspond to 'titles' in the periodicals data.
pids = list(
    dfpp.loc[dfpp["parent"] != ""]["parent"]
    .str.split("|")
    .explode()
    .reset_index()["parent"]
    .unique()
)
len(pids)
1654
Now we can filter the titles by dropping any whose identifiers appear in the parent ids from the Parliamentary Papers.
df_notpp = df_titles.loc[~df_titles["id"].isin(pids)]
How many titles are there now?
df_notpp.shape[0]
949
Getting a list of issues for each title¶
You can get additional information about a periodical by making a request to /magazine/title/[TITLE ID]. In particular, by setting the include parameter to years and providing a date range using the range parameter, you can get a list of issues from that periodical within the supplied date range. However, there's a bug in the API that means issues that don't have a date are excluded from the results. There are also some titles that don't seem to have any issues.
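To see what this looks like in practice, here's a sketch of a single request for one title. The title id is just the first one in our filtered dataframe, and the date range is an arbitrary example.
# Sketch: request the issues of a single title within a sample date range
sample_id = df_notpp.iloc[0]["id"]
sample_data = s.get(
    f"https://api.trove.nla.gov.au/v3/magazine/title/{sample_id}",
    params={"encoding": "json", "include": "years", "range": "19000101-19991231"},
    headers={"X-API-KEY": API_KEY},
).json()
# Each entry under 'year' has a date, an issuecount, and (when a range is supplied) a list of issues
for year in sample_data.get("year", [])[:3]:
    print(year["date"], year["issuecount"], len(year.get("issue", [])))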
The code below harvests the available issue data and saves it into the titles dataset. It also adds a count of the issues without dates that are missing from the results.
def get_periodical_issues(df, output_file="titles-issues.ndjson"):
    """
    Work through the list of titles harvesting details of available issues.
    """
    params = {"encoding": "json", "include": "years"}
    headers = {"X-API-KEY": API_KEY}
    # File in which the data will be saved
    output_ndjson = Path(output_file)
    output_ndjson.unlink(missing_ok=True)
    for title in tqdm(df.itertuples(), total=df.shape[0]):
        # For each title use the include=years parameter to get the number of issues per year.
        response = s.get(
            f"https://api.trove.nla.gov.au/v3/magazine/title/{title.id}",
            params=params,
            headers=headers,
        )
        data = response.json()
        years = []
        unknown_dates = 0
        issue_count = 0
        issues = []
        ranges = []
        start_year = None
        end_year = None
        # Loop through the years in which issues were published
        for year in data.get("year", []):
            # If the year is 'unknown' add to the unknown dates count
            if year["date"] == "unknown":
                unknown_dates += int(year["issuecount"])
            # Otherwise add to the list of years
            else:
                issue_count += int(year["issuecount"])
                years.append(int(year["date"]))
        # If the list of years isn't empty, get the start and end of the date range
        if years:
            years_sorted = sorted(years)
            start_year = years_sorted[0]
            end_year = years_sorted[-1]
        # If we have dates and the number of issues is large, split up the date range into blocks of 10 years
        # These numbers are pretty arbitrary, it's just to avoid requesting details of thousands of issues in one hit
        if years and issue_count > 500:
            for r in range(start_year, end_year + 1, 10):
                ranges.append(f"{r}0101-{r+9}1231")
        # Otherwise just construct a single date range from the start and end dates
        elif years:
            ranges.append(f"{start_year}0101-{end_year}1231")
        # For each date range request a list of issues
        for date_range in ranges:
            issue_params = params.copy()
            issue_params["range"] = date_range
            issue_response = s.get(
                f"https://api.trove.nla.gov.au/v3/magazine/title/{title.id}",
                params=issue_params,
                headers=headers,
            )
            issue_data = issue_response.json()
            # Loop through the issue data saving it to issues
            for year in issue_data.get("year", []):
                issues += year.get("issue", [])
        # Update the dataset with the issue values
        data["start_year"] = start_year
        data["end_year"] = end_year
        data["issue_count"] = issue_count
        data["unknown_dates"] = unknown_dates
        data.pop("year", None)
        data["issues"] = issues
        # Write the updated data to an ndjson file
        with output_ndjson.open("a") as titles_file:
            titles_file.write(json.dumps(data) + "\n")
get_periodical_issues(df_notpp)
Let's load the full dataset that now includes all the issues.
df_issues = pd.read_json("titles-issues.ndjson", lines=True)
How many issues are there?
df_issues["issue_count"].sum()
36554
How many issues with unknown dates are there? These issues will be missing from the data.
df_issues["unknown_dates"].sum()
727
How many issues should we have?
df_issues["issue_count"].sum() + df_issues["unknown_dates"].sum()
37281
How many titles don't have any issues?
df_issues.loc[
    (df_issues["issue_count"] == 0) & (df_issues["unknown_dates"] == 0)
].shape[0]
110
How many titles have missing issues?
df_issues.loc[df_issues["unknown_dates"] != 0].shape[0]
123
Find missing issues¶
From the information above we can see that at least 727 issues from 123 titles are missing. There are also another 110 titles that don't have any issues listed. Hopefully the problems with the API will be fixed at some point, but for now we have to try and fill in the gaps.
One way to do this is to scrape the list of issues from the digital collection viewer and compare this to the list of issues available from the API. Of course, we could completely dump the API results and just use the scraped data, but when (if?) the API is fixed I'm hoping this step can be removed.
def get_metadata(id):
    """
    Extract work data in a JSON string from the work's HTML page.
    """
    if not id.startswith("http"):
        id = "https://nla.gov.au/" + id
    response = s.get(id)
    try:
        work_data = re.search(
            r"var work = JSON\.parse\(JSON\.stringify\((\{.*\})", response.text
        ).group(1)
    except AttributeError:
        work_data = "{}"
    if not response.from_cache:
        time.sleep(0.2)
    return json.loads(work_data)
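Just to check it works, we can call it on any digitised object id; if the page doesn't include the embedded JSON it simply returns an empty dict. The id below is one of the titles harvested earlier, used here purely as an illustration.
# Illustrative check: fetch the embedded work metadata for one digitised object
sample_work = get_metadata("nla.obj-8423556")
sorted(sample_work.keys())[:10]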
def get_issues(parent_id):
    """
    Get the ids of issues that are children of the current record
    by scraping the 'Browse this collection' box in the digital collection viewer.
    See:
    """
    start_url = "https://nla.gov.au/{}/browse?startIdx={}&rows=20&op=c"
    # The initial startIdx value
    start = 0
    # Number of results per page
    n = 20
    parts = []
    # If there aren't 20 results on the page then we've reached the end, so continue harvesting until that happens.
    while n == 20:
        # Get the browse page
        response = s.get(start_url.format(parent_id, start))
        # Beautifulsoup turns the HTML into an easily navigable structure
        soup = BeautifulSoup(response.text, "lxml")
        # Find all the divs containing issue details and loop through them
        details = soup.find_all(class_="l-item-info")
        for detail in details:
            # Get the issue id
            title = detail.find("h3")
            if title:
                issue_id = title.parent["href"].strip("/")
            else:
                issue_id = detail.find("a")["href"].strip("/")
            parts.append(issue_id)
        if not response.from_cache:
            time.sleep(0.2)
        # Increment the startIdx
        start += n
        # Set n to the number of results on the current page
        n = len(details)
    return parts
def get_iso_date(date):
    """
    Try to convert a date string into an ISO formatted date.
    """
    if date:
        iso_date = arrow.get(date, "ddd, D MMM YYYY").format("YYYY-MM-DD")
    else:
        iso_date = ""
    return iso_date
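A quick check of the conversion; the input string here is just an example matching the 'ddd, D MMM YYYY' pattern used above:
# Example conversion using the date format expected from the viewer metadata
get_iso_date("Sat, 1 Jan 1916")
'1916-01-01'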
def add_issue(title, issue, metadata):
    """
    Create a record for an issue using metadata extracted from the digitised journal viewer.
    """
    iso_date = get_iso_date(metadata.get("issueDate", ""))
    title["issues"].append(
        {
            "id": issue,
            "date": iso_date,
            "description": metadata.get("subUnitNo", ""),
            "url": "https://nla.gov.au/" + issue,
        }
    )
def add_missing_issues(
    input_file="titles-issues.ndjson", output_file="titles-issues-added.ndjson"
):
    output_ndjson = Path(output_file)
    output_ndjson.unlink(missing_ok=True)
    total = sum(1 for _ in open(input_file))
    with Path(input_file).open("r") as ndjson_in:
        with output_ndjson.open("w") as ndjson_out:
            for line in tqdm(ndjson_in, total=total):
                title = json.loads(line)
                title_issues = [i["id"] for i in title["issues"]]
                # Get a list of issues scraped from the collection viewer
                issues = get_issues(title["id"])
                for issue in issues:
                    if issue not in title_issues:
                        # Get issue metadata from the digitised journal viewer
                        issue_metadata = get_metadata(issue)
                        # Construct a record for the missing issue and add it to the title data
                        add_issue(title, issue, issue_metadata)
                # Add a new issue count
                title["new_issue_count"] = len(title["issues"])
                ndjson_out.write(f"{json.dumps(title)}\n")
add_missing_issues()
Let's load the updated dataset to see what we have now.
df_added_issues = pd.read_json("titles-issues-added.ndjson", lines=True)
How many issues have been added?
df_added_issues["new_issue_count"].sum() - df_added_issues["issue_count"].sum()
732
Additional problems¶
It seems we've managed to find the missing issues, and a few extras, but there are other problems in the data. Quite a few titles have no issues at all. The records might be broken, or the periodical might be in the process of being digitised; we can't really tell.
df_added_issues.loc[df_added_issues["new_issue_count"] == 0].shape[0]
108
Some things listed as titles actually point to issues. And some things listed as issues are actually collections of issues. These are problems in the way the data has been entered into Trove. Fixing them in the dataset requires some complicated workarounds and a lot of extra processing. I've decided to leave this dataset as is, to reflect the process of working with the API to retrieve data about titles and issues. But I've attempted some additional cleaning and enrichment in Enrich the list of periodicals from the Trove API.
# IGNORE THIS CELL -- TESTING ONLY
if os.getenv("GW_STATUS") == "dev":
    get_periodical_issues(df_notpp[:10], output_file="title-issues-test.ndjson")
    add_missing_issues(
        input_file="title-issues-test.ndjson",
        output_file="titles-issues-added-test.ndjson",
    )
Created by Tim Sherratt for the GLAM Workbench.