Harvest public tags from Trove zones¶

This notebook harvests all the public tags that users have added to records in Trove. However, tags are being added all the time, so by the time you've finished harvesting, the dataset will probably be out of date.

You can access tags via the API by adding has:tags to the q (query) parameter to limit results to records with tags, and setting include=tags to embed the tag data in each item record.
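
If you want to see what the tag data looks like before running a full harvest, you can try a single request. The cell below is a minimal sketch using the newspaper zone and the same v2 endpoint and parameters as the harvesting code later in this notebook; you'll need to insert your own Trove API key.

In [ ]:
import requests

# A single request to the Trove v2 API – limit results to records with
# tags, and include the tag data in each record
params = {
    "q": "has:tags",
    "zone": "newspaper",
    "include": "tags",
    "encoding": "json",
    "n": 1,
    "key": "YOUR API KEY",  # insert your own Trove API key
}
response = requests.get("http://api.trove.nla.gov.au/v2/result", params=params)
data = response.json()

# Each record has a 'tag' list; each tag has 'value' and 'lastupdated' fields
record = data["response"]["zone"][0]["records"]["article"][0]
print(record["tag"])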

The harvest_tags() function harvests all tags from the specified zone and writes them to a CSV file named according to the zone, for example, tags_newspaper.csv.

Each CSV file contains the following columns:

  • tag – the tag text
  • date – date the tag was added
  • zone – the Trove API zone (eg 'newspaper', 'book')
  • record_id – the id of the record to which the tag has been added

Once the zone harvests are complete you can use this notebook to combine the separate CSV files, normalise the capitalisation of tags, and save the complete results into a single CSV file.

Some things to note:

  • Works (like books) can have tags attached at either work or version level. To simplify things, this code aggregates all tags at the work level, removing any duplicates.
  • A single resource in Trove can appear in multiple zones – for example, a book that includes maps and illustrations might appear in the 'book', 'picture', and 'map' zones. This means that some of the tags will essentially be duplicates – harvested from different zones, but relating to the same resource. The sketch after this list shows one rough way of estimating how much of this duplication there is.
  • User content added to Trove, including tags, is available for reuse under a CC-BY-NC licence.
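
The sketch below gives a rough way of estimating the amount of cross-zone duplication. It assumes the combined, normalised CSV file created at the end of this notebook, so you'd run it after the harvest has finished; the filename and column names match those used below.

In [ ]:
import pandas as pd

# Assumes the combined, normalised dataset saved at the end of this notebook
df = pd.read_csv("trove_tags_20240606.csv")

# Works share the same id across zones, so ignoring the zone column and
# de-duplicating gives a rough count of cross-zone duplicates
deduped = df.drop_duplicates(subset=["tag", "date", "record_id"])
print(f"{len(df) - len(deduped):,} rows look like cross-zone duplicates")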

The complete dataset created by this notebook is available for download from Zenodo.

For some examples of how you might analyse and visualise the harvested tags, see this notebook.

This notebook has not been updated to work with version 3 of the Trove API because, as of June 2024, there remains a problem in the data that causes bulk harvests using v3 to fail.

In [1]:
import csv
import os
import time
from pathlib import Path

import pandas as pd
import requests_cache
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm

# Create a cached session and retry requests that fail with server errors
s = requests_cache.CachedSession()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
s.mount("http://", HTTPAdapter(max_retries=retries))
s.mount("https://", HTTPAdapter(max_retries=retries))
In [2]:
%%capture
# Load variables from the .env file if it exists
# Use %%capture to suppress messages
%load_ext dotenv
%dotenv
In [3]:
# Insert your Trove API key between the quotes
API_KEY = "YOUR API KEY"

# Use api key value from environment variables if it is available
if os.getenv("TROVE_API_KEY"):
    API_KEY = os.getenv("TROVE_API_KEY")
In [4]:
api_url = "http://api.trove.nla.gov.au/v2/result"

# Set basic parameters
params = {
    "q": "has:tags",
    "include": "tags",
    "encoding": "json",
    "bulkHarvest": "true",
    "n": 100,
    "key": API_KEY,
}

# These types are needed to get data from API results
record_types = {
    "newspaper": "article",
    "gazette": "article",
    "book": "work",
    "article": "work",
    "picture": "work",
    "music": "work",
    "map": "work",
    "collection": "work",
    "list": "list",
}
In [5]:
def get_total(cparams):
    """
    This will enable us to make a nice progress bar...
    """
    response = s.get(api_url, params=cparams)
    data = response.json()
    return int(data["response"]["zone"][0]["records"]["total"])


def get_tags_from_record(record):
    """
    Extract tags from the supplied record.
    Returns a list of tags.
    Each tag is a list with two elements – value and date.
    """
    tags = []
    try:
        for tag in record["tag"]:
            tag_data = [tag.get("value"), tag.get("lastupdated")]
            tags.append(tag_data)
    except KeyError:
        pass
    return tags


def harvest_tags(zone):
    """
    Harvest public tags from the specified zone.
    Results are written to a CSV file.
    """
    print(zone)
    # article, work, or list
    record_type = record_types[zone]
    # Delete existing data file
    Path(f"tags_{zone}.csv").unlink(missing_ok=True)
    # Write column headings
    with Path(f"tags_{zone}.csv").open("a") as tag_file:
        writer = csv.writer(tag_file)
        writer.writerow(["tag", "date", "zone", "record_id"])
    start = "*"
    cparams = params.copy()
    cparams["zone"] = zone
    # If it's a work, get versions as well
    if record_type == "work":
        cparams["include"] = "tags,workversions"
    total = get_total(cparams)
    with tqdm(total=total) as pbar:
        while start is not None:
            cparams["s"] = start
            response = s.get(api_url, params=cparams)
            data = response.json()
            results = data["response"]["zone"][0]["records"]
            # Get token for next page
            try:
                start = results["nextStart"]
            # End of the result set
            except KeyError:
                start = None
            with Path(f"tags_{zone}.csv").open("a") as tag_file:
                writer = csv.writer(tag_file)
                for record in results[record_type]:
                    tags = []
                    tags += get_tags_from_record(record)
                    # If there are versions loop through them gathering tags
                    if "version" in record:
                        for version in record["version"]:
                            tags += get_tags_from_record(version)
                    # Remove duplicate tags on work
                    tags = [list(t) for t in {tuple(tl) for tl in tags}]
                    # If a record somehow has no tags, print it so it can be checked
                    if len(tags) == 0:
                        print(record)
                    # Add zone and record_id, then write to CSV
                    for tag in tags:
                        tag.append(zone)
                        tag.append(record["id"])
                        writer.writerow(tag)
            pbar.update(len(results[record_type]))
            if not response.from_cache:
                time.sleep(0.2)
In [ ]:
for zone in [
    "newspaper",
    "gazette",
    "book",
    "article",
    "picture",
    "music",
    "map",
    "collection",
    "list",
]:
    harvest_tags(zone)

Combine the tag files and convert to a dataframe¶

In [9]:
dfs = []
for zone in [
    "newspaper",
    "gazette",
    "book",
    "article",
    "picture",
    "music",
    "map",
    "collection",
    "list",
]:
    dfs.append(pd.read_csv(f"tags_{zone}.csv"))
df = pd.concat(dfs)
df.head()
Out[9]:
  tag                         date                  zone       record_id
0 TCCC                        2024-03-26T23:22:30Z  newspaper    1000000
1 TCCC                        2024-03-26T23:32:50Z  newspaper  100000001
2 Stephen Guihen              2013-03-24T02:30:11Z  newspaper  100000011
3 test 22/6/23 @ 9:09am       2023-06-21T23:09:34Z  newspaper  100000068
4 HICKEN Aberaham - Barellan  2019-12-03T23:02:10Z  newspaper  100000071

How many tags have been added in total? (Note that this count can include duplicates where the same resource appears in multiple zones.)

In [10]:
df.shape
Out[10]:
(10403650, 4)

How many unique tags are there?

In [11]:
df["tag"].nunique()
Out[11]:
2495958

Normalise capitalisation and save as CSV¶

Tag capitalisation is inconsistent, even though tag searches in Trove are case-insensitive. Here we'll convert all the tags to lower case so that we can aggregate the different forms.

In [12]:
df["tag_normalised"] = df["tag"].str.lower()

To keep things compact, we'll drop the original mixed-case tag column and rename the normalised column.

In [13]:
# Remove the unnormalised tag column
df.drop(columns="tag", inplace=True)
# Rename the lowercase tag column
df.rename(columns={"tag_normalised": "tag"}, inplace=True)

Now let's save the complete, normalised dataset to a single CSV file.

In [14]:
# Reorder columns and save as CSV
df[["tag", "date", "zone", "record_id"]].to_csv("trove_tags_20240606.csv", index=False)

Created by Tim Sherratt for the GLAM Workbench. Support this project by becoming a GitHub sponsor.