Harvest summary data from Trove lists¶

Using the Trove API we'll harvest some information about Trove lists and create a dataset containing the following fields:

  • id — the list identifier; you can use it to get more information about a list from either the web interface or the API
  • title
  • number_items — the number of items in the list
  • created — the date the list was created
  • updated — the date the list was last updated
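
Each harvested list ends up as a simple dictionary with those five fields, which pandas can turn straight into a table. The values below are invented for illustration:

```python
import pandas as pd

# Illustrative only -- these values are made up, not real Trove data
sample_record = {
    "id": "12345",
    "title": "An example list",
    "number_items": 42,
    "created": "2015-04-03T11:50:51Z",
    "updated": "2016-02-22T04:27:12Z",
}

df_sample = pd.DataFrame([sample_record])
print(df_sample.columns.tolist())
```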

If you haven't used one of these notebooks before, they're basically web pages in which you can write, edit, and run live code. They're meant to encourage experimentation, so don't feel nervous. Just try running a few cells and see what happens!

Some tips:

  • Code cells have boxes around them.
  • To run a code cell click on the cell and then hit Shift+Enter. The Shift+Enter combo will also move you to the next cell, so it's a quick way to work through the notebook.
  • While a cell is running a * appears in the square brackets next to the cell. Once the cell has finished running the asterisk will be replaced with a number.
  • In most cases you'll want to start from the top of the notebook and work your way down, running each cell in turn. Later cells might depend on the results of earlier ones.
  • To edit a code cell, just click on it and type stuff. Remember to run the cell once you've finished editing.

Setting up...¶

In [ ]:
import datetime
import os
import warnings
from json import JSONDecodeError
from operator import itemgetter

warnings.simplefilter(action="ignore", category=FutureWarning)

import altair as alt
import nltk
import pandas as pd
import requests_cache
from dotenv import load_dotenv
from IPython.display import HTML, display
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from textblob import TextBlob
from tqdm.auto import tqdm
from wordcloud import WordCloud

nltk.download("stopwords")
nltk.download("punkt")

s = requests_cache.CachedSession()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
s.mount("http://", HTTPAdapter(max_retries=retries))
s.mount("https://", HTTPAdapter(max_retries=retries))

load_dotenv()

Add your Trove API key¶

In [19]:
# Insert your Trove API key between the quotes
API_KEY = "YOUR API KEY"

# Use api key value from environment variables if it is available
if os.getenv("TROVE_API_KEY"):
    API_KEY = os.getenv("TROVE_API_KEY")

Set some parameters¶

You could add a q value to the parameters below if you only want to harvest a subset of lists. As written, the cell below harvests everything.

In [20]:
api_url = "https://api.trove.nla.gov.au/v3/result"

params = {
    "category": "list",
    "encoding": "json",
    "n": 100,
    "s": "*",
    "reclevel": "full",
    "bulkHarvest": "true",
}

headers = {"X-API-KEY": API_KEY}
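
If you only want a subset of lists, the parameter set might look like this (the keyword is just an example, not part of the original harvest):

```python
# Same parameters as above, with an illustrative "q" keyword added
# to restrict the harvest to matching lists.
subset_params = {
    "category": "list",
    "encoding": "json",
    "n": 100,
    "s": "*",
    "reclevel": "full",
    "bulkHarvest": "true",
    "q": "weather",  # example keyword only
}
print(subset_params["q"])
```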

Harvest the data¶

In [21]:
def get_total():
    """
    Get the total number of results -- this will enable us to make a nice progress bar.
    """
    response = s.get(api_url, params=params, headers=headers)
    data = response.json()
    return int(data["category"][0]["records"]["total"])
In [22]:
lists = []
total = get_total()
with tqdm(total=total) as pbar:
    while params["s"]:
        response = s.get(api_url, params=params, headers=headers)
        try:
            data = response.json()
        except JSONDecodeError:
            print(response.text)
            print(response.url)
            raise
        else:
            records = data["category"][0]["records"]
            try:
                params["s"] = records["nextStart"]
            except KeyError:
                params["s"] = None
            for record in records["list"]:
                try:
                    lists.append(
                        {
                            "id": record["id"],
                            "title": record.get("title", ""),
                            "number_items": record["listItemCount"],
                            "created": record["date"]["created"],
                            "updated": record["date"]["lastupdated"],
                        }
                    )
                except TypeError:
                    print(record)
            pbar.update(100)
  0%|          | 0/111965 [00:00<?, ?it/s]
None
None
None
None
None
(A handful of malformed records print as None while the harvest runs.)
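
The harvest loop above relies on Trove's cursor-style paging: each response includes a nextStart value that becomes the next s parameter, until no nextStart is returned. The same pattern can be sketched against a fake paged API (the page contents here are invented):

```python
def harvest(fetch_page, start="*"):
    """Follow a nextStart-style cursor until the API stops returning one.

    fetch_page is any callable that takes a cursor value and returns
    (records, next_cursor_or_None) -- here it stands in for a Trove request.
    """
    cursor = start
    results = []
    while cursor is not None:
        records, cursor = fetch_page(cursor)
        results.extend(records)
    return results


# A fake two-page API for illustration
pages = {"*": (["a", "b"], "page2"), "page2": (["c"], None)}
records = harvest(lambda s: pages[s])
print(records)  # -> ['a', 'b', 'c']
```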

Inspect the results¶

In [5]:
# Load past file for testing if in dev
if os.getenv("GW_STATUS") == "dev":
    df = pd.read_csv("data/trove-lists-2024-05-29.csv")
# Otherwise load current harvested data
else:
    df = pd.DataFrame(lists)
df.head()
In [6]:
df.describe()
Out[6]:
id number_items
count 111960.000000 111960.000000
mean 89844.496008 19.163433
std 50142.898174 83.319781
min 51.000000 0.000000
25% 47114.500000 1.000000
50% 90193.500000 4.000000
75% 132493.750000 13.000000
max 179448.000000 10351.000000

Save the harvested data as a CSV file¶

In [ ]:
csv_file = "data/trove-lists-{}.csv".format(datetime.datetime.now().isoformat()[:10])
df.to_csv(csv_file, index=False)
HTML('<a target="_blank" href="{}">Download CSV</a>'.format(csv_file))
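
To pick up the analysis later without re-harvesting, you can reload a saved CSV and parse the date columns as you go. A StringIO buffer with one invented row stands in for the saved file here:

```python
import io

import pandas as pd

# Stand-in for a previously saved harvest file (one invented row)
csv_data = io.StringIO(
    "id,title,number_items,created,updated\n"
    "51,Example list,3,2010-01-01T00:00:00Z,2010-06-01T00:00:00Z\n"
)
df_saved = pd.read_csv(csv_data, parse_dates=["created", "updated"])
print(df_saved.dtypes)
```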

How many items are in lists?¶

In [7]:
total_items = df["number_items"].sum()
print("There are {:,} items in {:,} lists.".format(total_items, df.shape[0]))
There are 2,145,538 items in 111,960 lists.
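
The summary stats above show that list sizes are highly skewed (a median of 4 items against a maximum of 10,351). Counting the empty lists is a one-liner; the DataFrame below is a toy stand-in for the harvested data:

```python
import pandas as pd

# Toy stand-in for the harvested data -- counts are invented
df_toy = pd.DataFrame({"number_items": [0, 0, 4, 13, 10351]})

empty = int((df_toy["number_items"] == 0).sum())
print(f"{empty:,} of {len(df_toy):,} lists are empty.")
```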

What is the biggest list?¶

In [8]:
biggest = df.loc[df["number_items"].idxmax()]
biggest
Out[8]:
id                                  71461
title           Victoria and elsewhere...
number_items                        10351
created              2015-04-03T11:50:51Z
updated              2016-02-22T04:27:12Z
Name: 91223, dtype: object
In [9]:
display(
    HTML(
        'The biggest list is <a target="_blank" href="https://trove.nla.gov.au/list?id={}">{}</a> with {:,} items.'.format(
            biggest["id"], biggest["title"], biggest["number_items"]
        )
    )
)
The biggest list is Victoria and elsewhere... with 10,351 items.

When were they created?¶

In [10]:
# This makes it possible to include more than 5000 records
# alt.data_transformers.enable('json', urlpath='files')
alt.data_transformers.disable_max_rows()
alt.Chart(df[["created"]]).mark_line().encode(
    x="yearmonth(created):T",
    y="count()",
    tooltip=[
        alt.Tooltip("yearmonth(created):T", title="Month"),
        alt.Tooltip("count()", title="Lists"),
    ],
).properties(width=600)
Out[10]:
[Chart: number of lists created per month]

What words are used in the titles?¶

In [11]:
titles = df["title"].str.lower().str.cat(sep=" ")
In [12]:
# Generate a word cloud image
wordcloud = WordCloud(width=1200, height=800).generate(titles)
wordcloud.to_image()
Out[12]:
[Word cloud of the most common words in list titles]

Word frequency¶

In [13]:
blob = TextBlob(titles)
stopwords = nltk.corpus.stopwords.words("english")
word_counts = [
    [word, count]
    for word, count in blob.lower().word_counts.items()
    if word not in stopwords
]
word_counts = sorted(word_counts, key=itemgetter(1), reverse=True)[:25]
pd.DataFrame(word_counts).style.format({1: "{:,}"}).bar(
    subset=[1], color="#d65f5f"
).set_properties(subset=[1], **{"width": "300px"})
Out[13]:
  0 1
0 family 7,377
1 list 4,358
2 ww1 4,333
3 soldier 4,303
4 articles 4,214
5 trove 3,962
6 john 2,918
7 william 2,723
8 history 2,419
9 james 1,962
10 george 1,639
11 thomas 1,586
12 henry 1,397
13 australian 1,198
14 australia 1,161
15 charles 1,127
16 mary 1,038
17 nsw 894
18 edward 892
19 nee 867
20 ww2 840
21 robert 834
22 joseph 780
23 nt 765
24 arthur 764
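
The same counting can be done with the standard library alone, which is a handy cross-check on TextBlob's numbers. The titles and stopword list below are toy stand-ins:

```python
from collections import Counter

# Toy titles and stopwords standing in for the real data
titles_toy = "family history of the smith family"
stopwords_toy = {"of", "the"}

counts = Counter(w for w in titles_toy.split() if w not in stopwords_toy)
print(counts.most_common(2))
```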

Bigram frequency¶

In [14]:
ngrams = [" ".join(ngram).lower() for ngram in blob.lower().ngrams(2)]
ngram_counts = (
    pd.DataFrame(ngrams)[0]
    .value_counts()
    .rename_axis("ngram")
    .reset_index(name="count")
)
display(
    ngram_counts[:25]
    .style.format({"count": "{:,}"})
    .bar(subset=["count"], color="#d65f5f")
    .set_properties(subset=["count"], **{"width": "300px"})
)
  ngram count
0 ww1 soldier 3,958
1 list of 3,886
2 of articles 3,858
3 soldier list 3,847
4 in trove 3,756
5 articles in 3,737
6 family history 1,103
7 nt ww2 725
8 family tree 367
9 of the 357
10 in australia 319
11 in the 318
12 wwi soldier 271
13 family of 255
14 south australia 232
15 william ww1 221
16 port lincoln 209
17 henry ww1 194
18 john ww1 182
19 and the 177
20 maroochydore slsc 175
21 world war 171
22 james ww1 161
23 mary ann 160
24 motor boat 153
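
TextBlob's ngrams method is doing something you can also sketch with zip: pair each token with its successor. A toy token list illustrates the idea:

```python
# Pair each word with the next one to form bigrams (toy tokens)
words = "ww1 soldier list of articles".split()
bigrams = [" ".join(pair) for pair in zip(words, words[1:])]
print(bigrams)
```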

Trigram frequency¶

In [15]:
ngrams = [" ".join(ngram).lower() for ngram in blob.lower().ngrams(3)]
ngram_counts = (
    pd.DataFrame(ngrams)[0]
    .value_counts()
    .rename_axis("ngram")
    .reset_index(name="count")
)
display(
    ngram_counts[:25]
    .style.format({"count": "{:,}"})
    .bar(subset=["count"], color="#d65f5f")
    .set_properties(subset=["count"], **{"width": "300px"})
)
  ngram count
0 list of articles 3,847
1 soldier list of 3,840
2 articles in trove 3,728
3 of articles in 3,721
4 ww1 soldier list 3,563
5 wwi soldier list 266
6 william ww1 soldier 219
7 henry ww1 soldier 191
8 john ww1 soldier 180
9 james ww1 soldier 160
10 george ww1 soldier 150
11 charles ww1 soldier 133
12 joseph ww1 soldier 124
13 edward ww1 soldier 123
14 of articles on 118
15 articles on trove 117
16 thomas ww1 soldier 115
17 australian gymnastics research 109
18 andrews of albury 106
19 cocker spaniel affix 105
20 arthur andrews of 100
21 dr arthur andrews 100
22 ww1 trophy guns 92
23 music resources theme 79
24 robert ww1 soldier 73

Created by Tim Sherratt for the GLAM Workbench.