Visualise the total number of newspaper articles in Trove by year and state
Trove currently includes more than 200 million digitised newspaper articles published between 1803 and the present. In this notebook we explore how those newspaper articles are distributed over time, and by state.
If you haven't used one of these notebooks before, they're basically web pages in which you can write, edit, and run live code. They're meant to encourage experimentation, so don't feel nervous. Just try running a few cells and see what happens!
Some tips:
- Code cells have boxes around them.
- To run a code cell click on the cell and then hit Shift+Enter. The Shift+Enter combo will also move you to the next cell, so it's a quick way to work through the notebook.
- While a cell is running, a * appears in the square brackets next to the cell. Once the cell has finished running, the asterisk will be replaced with a number.
- In most cases you'll want to start from the top of the notebook and work your way down, running each cell in turn. Later cells might depend on the results of earlier ones.
- To edit a code cell, just click on it and type stuff. Remember to run the cell once you've finished editing.
Is this thing on? If you can't edit or run any of the code cells, you might be viewing a static (read only) version of this notebook. Click here to load a live version running on Binder.
1. Setting things up
Import what we need
import os
from operator import itemgetter # used for sorting
import altair as alt
import pandas as pd # makes manipulating the data easier
import requests
from dotenv import load_dotenv
from IPython.display import clear_output
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm
# Make sure data directory exists
os.makedirs("docs", exist_ok=True)
# Create a session that will automatically retry on server errors
s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("http://", HTTPAdapter(max_retries=retries))
s.mount("https://", HTTPAdapter(max_retries=retries))
load_dotenv()
True
Enter a Trove API key
We're going to get our data from the Trove API. You'll need to get your own Trove API key and enter it below.
# Insert your Trove API key
API_KEY = "YOUR API KEY"
# Use api key value from environment variables if it is available
if os.getenv("TROVE_API_KEY"):
API_KEY = os.getenv("TROVE_API_KEY")
Set some default parameters
# Basic parameters to send to the Trove API, we'll add more later.
params = {
"category": "newspaper",
"l-artType": "newspaper",
"encoding": "json",
"n": 0, # We don't need any records, just the facets!
}
Define some functions
def get_results(params):
"""
Get JSON response data from the Trove API.
Parameters:
params - a dictionary of query parameters to send to the API
Returns:
JSON formatted response data from Trove API
"""
response = s.get(
"https://api.trove.nla.gov.au/v3/result",
params=params,
headers={"X-API-KEY": API_KEY},
timeout=30,
)
response.raise_for_status()
# print(response.url) # This shows us the url that's sent to the API
data = response.json()
return data
2. Show the total number of articles per year
In another notebook, I look at different ways of visualising Trove newspaper searches over time. In this notebook we're going to focus on showing everything. To search for everything, you just leave out the q parameter.
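For comparison, if you did want to search for something specific, you'd add a q value to these parameters. A minimal sketch (the query term 'weather' is just an illustration):
# A hypothetical keyword search -- adding a 'q' value narrows the results
keyword_params = params.copy()
keyword_params["q"] = "weather"
# We don't do this below, because we want results for everything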
Now we can find the total number of newspaper articles in Trove.
# Get the JSON data from the Trove API using our parameters
data = get_results(params)
# Navigate down the JSON hierarchy to find the total results
total = int(data["category"][0]["records"]["total"])
# Print the results
print("There are currently {:,} articles in Trove!".format(total))
There are currently 249,805,769 articles in Trove!
Ok, that's not all that useful. What would be more interesting is to show the total number of articles published each year. To do this we use the decade and year facets. There are more details in this notebook but, in short, we have to loop through the decades from 1800 to 2020, getting the total number of articles for each year within that decade.
These two functions do just that.
def get_facets(data):
"""
Loop through facets in Trove API response, saving terms and counts.
Parameters:
data - JSON formatted response data from Trove API
Returns:
A list of dictionaries containing: 'term', 'total_results'
"""
facets = []
try:
# The facets are buried a fair way down in the results
# Note that if you ask for more than one facet, you'll have to use the facet['name'] param to find the one you want
# In this case there's only one facet, so we can just grab the list of terms (which are in fact the results by year)
for term in data["category"][0]["facets"]["facet"][0]["term"]:
# Get the year and the number of results, and convert them to integers, before adding to our results
facets.append({"term": term["search"], "total_results": int(term["count"])})
# Sort facets by year
facets.sort(key=itemgetter("term"))
except (TypeError, KeyError):
pass
return facets
def get_facet_data(params, start_decade=180, end_decade=202):
"""
Loop through the decades from 'start_decade' to 'end_decade',
getting the number of search results for each year from the year facet.
Combine all the results into a single list.
Parameters:
params - parameters to send to the API
start_decade - first decade to request, expressed as the first three digits of the year (eg 180 for the 1800s)
end_decade - last decade to request (eg 202 for the 2020s)
Returns:
A list of dictionaries containing 'year', 'total_results' for the complete
period between the start and end decades.
"""
# Create a list to hold the facets data
facet_data = []
# Loop through the decades
for decade in tqdm(range(start_decade, end_decade + 1)):
# Avoid confusion by copying the params before we change anything.
search_params = params.copy()
# Add decade value to params
search_params["l-decade"] = decade
# Get the data from the API
data = get_results(search_params)
# Get the facets from the data and add to facets_data
facet_data += get_facets(data)
# Remove the progress bar (you can also set leave=False in tqdm, but that still leaves white space in Jupyter Lab)
clear_output()
return facet_data
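Before harvesting every decade, you can sanity-check these functions on a single decade. A minimal sketch (it sets the facet parameter explicitly here; we'll add it to our default params in the next step):
# Sanity check: get year-by-year counts for a single decade (the 1900s)
test_params = params.copy()
test_params["facet"] = "year"
test_params["l-decade"] = 190
get_facets(get_results(test_params))[:3]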
Next we need to tell the API we want the year facet values. We do this by setting the facet value in our parameters.
params["facet"] = "year"
Now it's just a matter of feeding our parameters to the get_facet_data() function.
total_facets = get_facet_data(params)
Let's convert the data to a Pandas dataframe so we can feed it to Altair, our charting program.
df_total = pd.DataFrame(total_facets)
df_total.head()
| | term | total_results |
|---|---|---|
| 0 | 1803 | 526 |
| 1 | 1804 | 619 |
| 2 | 1805 | 430 |
| 3 | 1806 | 367 |
| 4 | 1807 | 134 |
Let's display the results as a simple line chart using Altair.
# Feed Altair our dataframe
alt.Chart(df_total).mark_line(point=True).encode(
# Years along the X axis
x=alt.X("term:Q", axis=alt.Axis(format="c", title="Year")),
# Number of articles on the Y axis (formatted with thousands separators)
y=alt.Y("total_results:Q", axis=alt.Axis(format=",d", title="Number of articles")),
# Use tooltips to display the year and number of articles when you hover over a point
tooltip=[
alt.Tooltip("term:Q", title="Year"),
alt.Tooltip("total_results:Q", title="Articles", format=","),
],
).properties(width=700, height=400)
Hmmm, that is interesting. There's a significant peak in the number of articles around 1915. Why might that be? Were there more newspapers? Were more articles written because of the war?
Nope. It's because of funding and digitisation priorities. Not all Australian newspapers are in Trove. Some have been lost, and many are just waiting to be digitised. Funding is always limited, so priorities have to be set. In the lead up to the centenary of World War I, it was decided to focus on digitising newspapers from that period. This chart reflects those priorities. This is not the number of newspaper articles published in Australia, it's the number of newspaper articles that have been digitised and made available through Trove. It's important to remain aware of this as you use Trove.
But what about the dramatic drop-off in the number of articles after 1954? Was it the impact of other media technologies? Nope. It's because of copyright. Australian copyright law is complex, but it's likely that much of the material published in newspapers before 1955 is out of copyright. Therefore, in undertaking its digitisation program, the National Library decided that it could best manage the risks associated with copyright by focusing on newspapers published before 1955. You'll see, however, that there are a few exceptions. Some newspapers published after 1954 have been digitised on the basis of agreements between the publisher and the National Library. If you'd like to see what newspapers are available post-1954, have a look at Beyond the copyright cliff of death.
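If you want to put a number on this 'copyright cliff', you can sum the post-1954 counts in the data we've already harvested. A quick sketch using the df_total dataframe from above:
# Quantify the post-1954 drop-off using the harvested data
years = df_total["term"].astype(int)
post_1954 = df_total.loc[years > 1954, "total_results"].sum()
print(f"{post_1954:,} of {df_total['total_results'].sum():,} articles date from after 1954")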
3. Show the number of newspaper articles by state
The setting of priorities for newspaper digitisation has, in the past, been a collaborative effort between the National and State libraries. Now the focus seems to be more on a 'contributor model' where local communities or organisations fund the digitisation of their own newspapers. These sorts of arrangements obviously affect what gets digitised and when, so let's see what Trove's newspapers look like when we divide them up by state.
To do this, we'll change the facet parameter to 'state'.
params["facet"] = "state"
As we're not harvesting data for multiple decades, we only need to make one call to the API.
# Get the data from the API
data = get_results(params)
facets = []
# Loop through the facet terms (each term will be a state)
for term in data["category"][0]["facets"]["facet"][0]["term"]:
# Get the state and the number of results, and convert the count to an integer, before adding to our results
facets.append({"state": term["search"], "total_results": int(term["count"])})
# Convert to a dataframe
df_states = pd.DataFrame(facets)
df_states
| | state | total_results |
|---|---|---|
| 0 | New South Wales | 88834269 |
| 1 | Victoria | 48013836 |
| 2 | Queensland | 41131526 |
| 3 | South Australia | 25381813 |
| 4 | Western Australia | 25369631 |
| 5 | Tasmania | 15850246 |
| 6 | ACT | 3296997 |
| 7 | International | 854022 |
| 8 | National | 663996 |
| 9 | Northern Territory | 409433 |
The first thing that's obvious is that not all 'states' are states. This facet has been expanded to incorporate 'National' and 'International' newspapers. Let's look at how the numbers in each 'state' compare.
# Chart the results
alt.Chart(df_states).mark_bar().encode(
x=alt.X("state:N", title="State"),
y=alt.Y("total_results:Q", axis=alt.Axis(format=",d", title="Number of articles")),
tooltip=[
alt.Tooltip("state", title="State"),
alt.Tooltip("total_results:Q", title="Articles", format=","),
],
).properties(width=700, height=400)
There are almost twice as many newspaper articles from NSW as from anywhere else. Why? Again, we might be tempted to think that it's just because more newspapers were published in NSW. This could be true, but someone still has to pay to get them digitised. In this case, the State Library of NSW's Digital Excellence Program has supported large amounts of digitisation.
Once again, it's a reminder that digital collections are constructed. Things like priorities and funding introduce biases that are not easily visible through a standard search interface.
4. Show the number of articles by state and year
To look a bit further into this, let's combine the two charts above to show the number of newspaper articles for each state over time.
We'll start by setting the facet value back to 'year'. (We're still leaving out the q parameter, so we'll be searching for everything.)
params["facet"] = "year"
We can use the states data we just collected to get a list of possible values for the state facet.
states = df_states["state"].to_list()
states
['New South Wales', 'Victoria', 'Queensland', 'South Australia', 'Western Australia', 'Tasmania', 'ACT', 'International', 'National', 'Northern Territory']
Now we'll define a function to loop through the list of states getting the number of articles for each year.
def get_state_facets(params, states):
"""
Loop through the supplied list of states, getting the year-by-year results for each.
Parameters:
params - basic parameters to send to the API
states - a list of states to apply using the state facet
Returns:
A dataframe
"""
dfs = []
these_params = params.copy()
# Loop through the supplied list of states
for state in states:
# Set the state facet to the current state value
these_params["l-state"] = state
# Get year facets for this state & query
facet_data = get_facet_data(these_params)
# Convert the results to a dataframe
df = pd.DataFrame(facet_data)
# Add a state column to the dataframe and set its value to the current state
df["state"] = state
# Add this df to the list of dfs
dfs.append(df)
# Concatenate all the dataframes and return the result
return pd.concat(dfs)
We're ready to get the data!
# GET ALL THE DATA!
df_states_years = get_state_facets(params, states)
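Harvesting facets for every state and decade takes a while, so it's worth saving the results for reuse. A suggested addition (the docs directory was created at the start of the notebook; the filename is just an example):
# Save the harvested data so we don't have to call the API again
df_states_years.to_csv("docs/state_year_totals.csv", index=False)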
Hmm, how are we going to visualise the results? Let's start with a stacked area chart.
# Make a chart
alt.Chart(df_states_years).mark_area().encode(
# Show years on the X axis
x=alt.X("term:Q", axis=alt.Axis(format="c", title="Year")),
# Show the stacked number of articles on the Y axis
y=alt.Y(
"total_results:Q",
axis=alt.Axis(format=",d", title="Number of articles"),
stack=True,
),
# Use color to distinguish the states
color="state:N",
# And show the state / year / and total details on hover
tooltip=[
"state",
alt.Tooltip("term:Q", title="Year"),
alt.Tooltip("total_results:Q", title="Articles", format=","),
],
).properties(width=700, height=400)
That's interesting. You might notice that most of the post-1954 articles come from the ACT β through the special arrangements mentioned above, the Canberra Times has been digitised through to 1995. See Beyond the copyright cliff of death for details.
However, it's a bit hard to see the individual contributions of each state in this chart. So let's separate them out.
# A new chart that puts states in separate facets
alt.Chart(df_states_years).mark_area().encode(
# Year of the X axis
x=alt.X("term:Q", axis=alt.Axis(format="c", title="Year")),
# Number of articles on the Y axis
y=alt.Y("total_results:Q", axis=alt.Axis(format=",d", title="Number of articles")),
# Split the data up into sub-charts based on state
facet=alt.Facet("state:N", columns=3),
# Give each state a different color
color="state:N",
# Details on hover
tooltip=[
"state",
alt.Tooltip("term:Q", title="Year"),
alt.Tooltip("total_results:Q", title="Articles", format=","),
],
# Note the columns value to set the number of sub-charts in each row
).properties(width=200, height=150)
Again we can see the dominance of NSW, and the post-1954 tail of ACT articles. But what's going on in Victoria? Let's zoom in for a closer look.
# Chart of Victoria only
# Filter the dataframe to just show Victoria
alt.Chart(
df_states_years.loc[df_states_years["state"] == "Victoria"]
).mark_area().encode(
# Years on the X axis
x=alt.X("term:Q", axis=alt.Axis(format="c", title="Year")),
# Number of articles on the Y axis
y=alt.Y(
"total_results:Q",
axis=alt.Axis(format=",d", title="Number of articles"),
stack=True,
),
# Try to match the color in the chart above
color=alt.value("#8c564b"),
# Details on hover
tooltip=[
alt.Tooltip("term:Q", title="Year"),
alt.Tooltip("total_results:Q", title="Articles", format=","),
],
).properties(
width=700, height=400
)
Yikes! This is, of course, more evidence of the World War I effect. In the lead up to the centenary of WWI, Victoria decided to focus quite narrowly on the wartime period, digitising only the period between 1914 and 1919 for a number of newspapers.
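You can check the size of this effect with the data we've harvested. A rough sketch that computes the share of digitised Victorian articles falling within the 1914 to 1919 window:
# What proportion of Victorian articles date from 1914 to 1919?
vic = df_states_years.loc[df_states_years["state"] == "Victoria"]
vic_years = vic["term"].astype(int)
wwi_total = vic.loc[vic_years.between(1914, 1919), "total_results"].sum()
print(f"{wwi_total / vic['total_results'].sum():.1%} of Victorian articles date from 1914-1919")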
Priorities have to be set, decisions about funding have to be made. The point, once again, is that these sorts of decisions shape what you get back in your search results. To really understand what it is we're working with, we have to be prepared to look beyond the search box.
5. Combine everything and make it interactive!
For the sake of completeness, let's try combining everything to make an interactive chart that shows both the total number of articles per year, and the contributions of each state.
# Use color as the selector for filtering the chart
selection = alt.selection_point(encodings=["color"])
# Color is based on the state, or gray if another state is selected
color = alt.condition(
selection, alt.Color("state:N", legend=None), alt.value("lightgray")
)
# A basic area chart, starts stacked, but when filtered shows only the active state
area = (
alt.Chart(df_states_years)
.mark_area()
.encode(
# Years on the X axis
x=alt.X("term:Q", axis=alt.Axis(format="c", title="Year")),
# Number of articles on the Y axis
y=alt.Y(
"total_results:Q",
axis=alt.Axis(format=",d", title="Number of articles"),
stack=True,
),
# Color uses the settings defined above
color=color,
# Details on hover
tooltip=[
"state",
alt.Tooltip("term:Q", title="Year"),
alt.Tooltip("total_results:Q", title="Articles", format=","),
],
)
.properties(width=700, height=400)
.transform_filter(
# Filter data by state when a state is selected
selection
)
.add_params(selection)
)
# Add a bar chart showing the number of articles per state
bar = (
alt.Chart(df_states)
.mark_bar()
.encode(
# State on the X axis
x=alt.X("state:N", title="State"),
# Number of articles on the Y axis
y=alt.Y(
"total_results:Q", axis=alt.Axis(format=",d", title="Number of articles")
),
# Details on hover
tooltip=[
alt.Tooltip("state", title="State"),
alt.Tooltip("total_results:Q", title="Articles", format=","),
],
# Color based on state as defined above
color=color,
)
.properties(width=700, height=150)
.add_params(
# Highlight state when selected
selection
)
)
# For good measure we'll add an interactive legend (which is really just a mini chart)
# This makes it easier to select states that don't have many articles
legend = (
alt.Chart(df_states)
.mark_rect()
.encode(
# Show the states
y=alt.Y("state:N", axis=alt.Axis(orient="right", title=None)),
# Color as above
color=color,
)
.add_params(
# Highlight on selection
selection
)
)
# Concatenate the charts -- area & legend horizontal, then bar added vertically
(area | legend) & bar
You can select a state by clicking on a colour in any of the three connected charts. To unselect, just click on an area with no colour.
A version of this chart is updated weekly in the Trove Newspaper Data Dashboard.
6. Further reading
Tim Sherratt, 'Asking better questions: history, Trove and the risks that count', in Copyfight, Phillipa McGuinness (ed.), NewSouth Books, 2015.
Tim Sherratt, 'Seams and edges: dreams of aggregation, access, and discovery in a broken world', ALIA Online, 2015.
Tim Sherratt, 'Hacking heritage: understanding the limits of online access', preprint of a chapter submitted for publication as part of The Routledge International Handbook of New Digital Practices in Galleries, Libraries, Archives, Museums and Heritage Sites, forthcoming 2019.
Created by Tim Sherratt for the GLAM Workbench.
Support this project by becoming a GitHub sponsor.
Work on this notebook was supported by the Humanities, Arts and Social Sciences (HASS) Data Enhanced Virtual Lab.