import os
import time
from operator import itemgetter  # used for sorting

import altair as alt
import pandas as pd  # makes manipulating the data easier
import requests
from dotenv import load_dotenv
from IPython.display import clear_output
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm

# Create the local "data" directory (no error if it is already there)
os.makedirs("data", exist_ok=True)

# Retry policy: up to 5 attempts with backoff when the server answers 502, 503 or 504
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])

# Shared session used for every API request; each scheme gets its own retrying adapter
s = requests.Session()
for scheme in ("http://", "https://"):
    s.mount(scheme, HTTPAdapter(max_retries=retries))

# Load variables from a .env file (if any) into the environment
load_dotenv()

# Output: True

# Trove API key: the TROVE_API_KEY environment variable wins when it is set
# and non-empty; otherwise replace the placeholder text with your own key
API_KEY = os.getenv("TROVE_API_KEY") or "YOUR API KEY"

# The key is sent with every request in this header
headers = {"X-API-KEY": API_KEY}

# Basic parameters for Trove API
params = dict(
    [
        ("facet", "year"),  # Get the data aggregated by year.
        ("category", "newspaper"),
        ("l-artType", "newspaper"),
        ("encoding", "json"),
        ("n", 0),  # We don't need any records, just the facets!
    ]
)

# CHANGE THIS TO SEARCH FOR SOMETHING ELSE!
params["q"] = "radio"

def get_results(params):
    """
    Send one search request to the Trove API and return the decoded JSON body.
    Parameters:
        params - dictionary of query-string parameters for the request
    Returns:
        The response body parsed from JSON
    Raises:
        An HTTP error (via raise_for_status) if the API answers with an error status
    """
    api_url = "https://api.trove.nla.gov.au/v3/result"

    # Use the module-level session (which retries on server errors) and the
    # module-level headers (which carry the API key); give up after 30 seconds
    reply = s.get(api_url, params=params, headers=headers, timeout=30)

    # Fail loudly rather than trying to parse an error page
    reply.raise_for_status()

    # Tip: reply.url shows the exact url that was sent to the API
    return reply.json()


def get_facets(data):
    """
    Loop through facets in Trove API response, saving terms and counts.

    This is deliberately best-effort: if the response has no facet data
    (a missing key, an empty list, or a None where a container is expected)
    an empty list is returned instead of raising, so that a decade without
    results does not abort a longer harvest.

    Parameters:
        data  - JSON formatted response data from Trove API
    Returns:
        A list of dictionaries containing: 'year', 'total_results',
        sorted by year. Empty if no facet terms could be read.
    Raises:
        ValueError if a term's 'search' or 'count' value can't be converted to an integer
    """
    facets = []
    try:
        # The facets are buried a fair way down in the results.
        # Note that if you ask for more than one facet, you'll have to use the facet['name'] param to find the one you want.
        # In this case there's only one facet, so we can just grab the list of terms (which are in fact the results by year).
        for term in data["category"][0]["facets"]["facet"][0]["term"]:
            # Get the year and the number of results, and convert them to integers, before adding to our results
            facets.append(
                {"year": int(term["search"]), "total_results": int(term["count"])}
            )
    except (TypeError, KeyError, IndexError):
        # No (or incomplete) facet data in this response.
        # IndexError covers an empty 'category' or 'facet' list, which would otherwise crash.
        # Whatever was collected before the problem is kept.
        pass

    # Sort facets by year (done outside the try so partial results are ordered too)
    facets.sort(key=itemgetter("year"))
    return facets

def get_facet_data(params, start_decade=180, end_decade=201):
    """
    Harvest the year facet one decade at a time and combine the results.

    For every decade value from 'start_decade' to 'end_decade' (inclusive) a
    request is made with that value in the 'l-decade' parameter, and the
    year-by-year counts from each response are gathered into one list.
    Parameters:
        params - parameters to send to the API (not modified)
        start_decade - first 'l-decade' value to request
                       (the default 180 presumably means the 1800s -- confirm against the API docs)
        end_decade - last 'l-decade' value to request
    Returns:
        A list of dictionaries containing 'year', 'total_results' for the complete
        period between the start and end decades.
    """
    yearly_counts = []
    decades = range(start_decade, end_decade + 1)

    # tqdm wraps the range to show a progress bar
    for decade in tqdm(decades):

        # Build a fresh set of parameters so the caller's dictionary is left untouched
        decade_params = {**params, "l-decade": decade}

        # Fetch this decade and pull out its year facets
        yearly_counts.extend(get_facets(get_results(decade_params)))

        # Short pause between requests to stay under the API rate limit -- lengthen it if you get 403 errors
        time.sleep(0.2)

    # Remove the progress bar (you can also set leave=False in tqdm, but that still leaves white space in Jupyter Lab)
    clear_output()
    return yearly_counts

# Harvest the year-by-year counts for the current search (params includes 'q')
# and save the results to a variable called facet_data
facet_data = get_facet_data(params)

# Convert our data to a dataframe called df (columns: 'year', 'total_results')
df = pd.DataFrame(facet_data)

# Let's have a look at the first few rows of data
# (as the last expression in a notebook cell this is displayed; in a plain script it does nothing visible)
df.head()

# Show the row that has the highest value in the 'total_results' column.
# .idxmax returns the index label of the row with the highest value, and .loc uses that label to fetch the row
df.loc[df["total_results"].idxmax()]

# Output of the cell above:
# year               1954
# total_results    161172
# Name: 124, dtype: int64

# Line chart (with a point marker on each year) of the number of matching articles per year
alt.Chart(df).mark_line(point=True).encode(
    # Years on the X axis
    x=alt.X("year:Q", axis=alt.Axis(format="c", title="Year")),
    # Number of articles on the Y axis
    y=alt.Y("total_results:Q", axis=alt.Axis(format=",d", title="Number of articles")),
    # Display details when you hover over a point
    tooltip=[
        alt.Tooltip("year:Q", title="Year"),
        alt.Tooltip("total_results:Q", title="Articles", format=","),
    ],
).properties(width=700, height=400)

# Reset the 'q' parameter so the next harvest counts every article, not just search matches.
# pop() with a default is used rather than del so that re-running this cell
# (when 'q' has already been removed) doesn't raise a KeyError.
params.pop("q", None)

# Get facet data for all articles
all_facet_data = get_facet_data(params)

# Convert the results to a dataframe (columns: 'year', 'total_results')
df_total = pd.DataFrame(all_facet_data)

# Make a line chart of the total number of articles per year (from df_total)
alt.Chart(df_total).mark_line(point=True).encode(
    # Display the years along the X axis
    x=alt.X("year:Q", axis=alt.Axis(format="c", title="Year")),
    # Display the number of results on the Y axis (formatted using thousands separator)
    y=alt.Y("total_results:Q", axis=alt.Axis(format=",d", title="Number of articles")),
    # Create a tooltip when you hover over a point to show the data for that year
    tooltip=[
        alt.Tooltip("year:Q", title="Year"),
        alt.Tooltip("total_results:Q", title="Articles", format=","),
    ],
).properties(width=700, height=400)

def merge_df_with_total(df, df_total):
    """
    Attach yearly article totals to a dataframe of search results and work out proportions.

    The two dataframes are left-joined on 'year', so every row of the search
    results is kept and gains the matching total for its year. Because both
    inputs have a 'total_results' column, the merge suffixes them; they are
    renamed so the search counts stay 'total_results' and the totals become
    'total_articles'.
    Parameters:
        df - the search results in a dataframe
        df_total - total number of articles per year in a dataframe
    Returns:
        A new dataframe with the following columns - 'year', 'total_results', 'total_articles', 'proportion'
        (plus any other columns that are in the search results dataframe).
    """
    # Left join on year, then give the suffixed columns friendlier names in one step
    column_names = {
        "total_results_x": "total_results",
        "total_results_y": "total_articles",
    }
    combined = pd.merge(df, df_total, how="left", on="year").rename(
        columns=column_names
    )

    # Replace any blank search counts with zero and make sure they are integers
    search_counts = combined["total_results"].fillna(0).astype(int)
    combined["total_results"] = search_counts

    # Proportion = search results divided by total articles for the same year
    combined["proportion"] = combined["total_results"] / combined["total_articles"]
    return combined

# Merge the search results with the total articles (adds 'total_articles' and 'proportion' columns)
df_merged = merge_df_with_total(df, df_total)
df_merged.head()

# This is the chart showing raw results -- it's the same as the one we created above (but a bit smaller)
chart1 = (
    alt.Chart(df)
    .mark_line(point=True)
    .encode(
        x=alt.X("year:Q", axis=alt.Axis(format="c", title="Year")),
        y=alt.Y(
            "total_results:Q", axis=alt.Axis(format=",d", title="Number of articles")
        ),
        tooltip=[
            alt.Tooltip("year:Q", title="Year"),
            alt.Tooltip("total_results:Q", title="Articles", format=","),
        ],
    )
    .properties(width=700, height=250)
)

# This is the new view, note that it's using the 'proportion' column for the Y axis
# NOTE(review): a colour is set twice below -- "red" on the mark and "orange" in the
# encoding. Presumably the encoded value is the one that is drawn; confirm and drop the other.
chart2 = (
    alt.Chart(df_merged)
    .mark_line(point=True, color="red")
    .encode(
        x=alt.X("year:Q", axis=alt.Axis(format="c", title="Year")),
        # This time we're showing the proportion (formatted as a percentage) on the Y axis
        y=alt.Y(
            "proportion:Q", axis=alt.Axis(format="%", title="Proportion of articles")
        ),
        tooltip=[
            alt.Tooltip("year:Q", title="Year"),
            alt.Tooltip("proportion:Q", title="Proportion", format="%"),
        ],
        # Use a fixed colour so this chart looks different from the raw results chart
        color=alt.value("orange"),
    )
    .properties(width=700, height=250)
)

# This is a shorthand way of stacking the charts on top of each other
chart1 & chart2

# Create a list of queries to compare with each other
queries = ["telegraph", "radio", "wireless"]

def get_search_facets(params, queries):
    """
    Harvest year-by-year counts for several search queries and stack them into one dataframe.

    Parameters:
        params - basic parameters to send to the API (not modified)
        queries - a list of search queries
    Returns:
        A dataframe with the facet data for every query, plus a 'query' column
        saying which search each row came from
    """
    # Work on a copy so the caller's parameters stay as they were
    search_params = params.copy()

    # One dataframe per query ends up in here
    frames = []

    for term in queries:
        # Point the 'q' parameter at the current search query
        search_params["q"] = term

        # Harvest all the decades for this query and load the results into a dataframe
        frame = pd.DataFrame(get_facet_data(search_params))

        # Label the rows with the query so results can be told apart once combined
        frame["query"] = term
        frames.append(frame)

    # Stack the individual dataframes into a single one
    return pd.concat(frames)

# Harvest the year-by-year results for every query in the list
df_queries = get_search_facets(params, queries)

# Add the total number of articles per year and the proportion for each row
df_queries_merged = merge_df_with_total(df_queries, df_total)

def make_chart_totals(df, category, category_title):
    """
    Build a line chart of the raw number of search results per year.
    Each distinct value in the category column gets its own coloured line.
    Parameters:
        df - a dataframe with 'year' and 'total_results' columns plus the category column
        category - the column containing the value that distinguishes multiple results set (eg 'query' or 'state')
        category_title - a nicely formatted title for the category to appear above the legend
    Returns:
        The chart object
    """
    # The category column, treated as nominal (named) data
    category_field = f"{category}:N"

    # Year along the X axis
    x_axis = alt.X("year:Q", axis=alt.Axis(format="c", title="Year"))

    # Total number of articles up the Y axis (with thousands separator)
    y_axis = alt.Y(
        "total_results:Q",
        axis=alt.Axis(format=",d", title="Number of articles"),
    )

    # On hover show the query/category, the year, and the number of results
    hover_details = [
        alt.Tooltip(category_field, title=category_title),
        alt.Tooltip("year:Q", title="Year"),
        alt.Tooltip("total_results:Q", title="Articles", format=","),
    ]

    # We're comparing results, so colour tells the queries/categories apart
    line_colour = alt.Color(category_field, legend=alt.Legend(title=category_title))

    lines = alt.Chart(df).mark_line(point=True)
    encoded = lines.encode(
        x=x_axis, y=y_axis, tooltip=hover_details, color=line_colour
    )
    return encoded.properties(width=700, height=250)


def make_chart_proportions(df, category, category_title):
    """
    Build a line chart of search results as a proportion of all articles per year.
    Each distinct value in the category column gets its own coloured line.
    Parameters:
        df - a dataframe with 'year' and 'proportion' columns plus the category column
        category - the column containing the value that distinguishes multiple results set (eg 'query' or 'state')
        category_title - a nicely formatted title for the category to appear above the legend
    Returns:
        The chart object
    """
    # The category column, treated as nominal (named) data
    category_field = f"{category}:N"

    # Year along the X axis
    x_axis = alt.X("year:Q", axis=alt.Axis(format="c", title="Year"))

    # Proportion of articles up the Y axis (formatted as percentage), with stacking switched off
    y_axis = alt.Y(
        "proportion:Q",
        axis=alt.Axis(format="%", title="Proportion of articles"),
        stack=None,
    )

    # On hover show the query/category, the year, and the proportion of results
    hover_details = [
        alt.Tooltip(category_field, title=category_title),
        alt.Tooltip("year:Q", title="Year"),
        alt.Tooltip("proportion:Q", title="Proportion", format="%"),
    ]

    # We're comparing results, so colour tells the queries/categories apart
    line_colour = alt.Color(category_field, legend=alt.Legend(title=category_title))

    lines = alt.Chart(df).mark_line(point=True)
    encoded = lines.encode(
        x=x_axis, y=y_axis, tooltip=hover_details, color=line_colour
    )
    return encoded.properties(width=700, height=250)

# Chart total results, one coloured line per search query
chart3 = make_chart_totals(df_queries_merged, "query", "Search query")

# Chart proportions, one coloured line per search query
chart4 = make_chart_proportions(df_queries_merged, "query", "Search query")

# Shorthand way of concatenating the two charts (note there's only one legend)
chart3 & chart4

# A list of state values that we'll supply to the state facet ('l-state')
states = ["New South Wales", "Victoria"]

# Remember this time we're comparing a single search query across multiple states
query = "Chinese"

def get_state_totals(state):
    """
    Harvest the total number of articles per year for one state.

    Starts from the module-level params, without any search query, and
    limits the results with the state facet.
    Parameters:
        state - value to send in the 'l-state' parameter
    Returns:
        A list of dictionaries containing 'year', 'total_results'.
    """
    # Copy the basic parameters, leaving out 'q' so no search query is sent
    state_params = {key: value for key, value in params.items() if key != "q"}

    # Limit results to the given state
    state_params["l-state"] = state

    # Harvest and return the year by year data
    return get_facet_data(state_params)


def get_state_facets(params, states, query):
    """
    Run one search query against each state in turn and collect the year-by-year results.
    For each state the search results are merged with that state's total number
    of articles, so proportions can be compared between states.
    Parameters:
        params - basic parameters to send to the API (not modified)
        states - a list of states to apply using the state facet
        query - the search query to use
    Returns:
        A dataframe combining the merged results of every state, with a 'state' column
    """
    # Copy of the basic parameters with the 'q' parameter set to the supplied query
    search_params = {**params, "q": query}

    frames = []
    for state in states:
        # Limit the search to the current state
        search_params["l-state"] = state

        # Year facets for this state & query, as a dataframe
        results = pd.DataFrame(get_facet_data(search_params))

        # Total number of articles per year for this state, as a dataframe
        totals = pd.DataFrame(get_state_totals(state))

        # Combine the two and work out proportions
        combined = merge_df_with_total(results, totals)

        # Record which state these rows belong to
        combined["state"] = state
        frames.append(combined)

    # Stack the per-state dataframes into one
    return pd.concat(frames)

# Harvest results and totals for the query in each of the listed states
df_states = get_state_facets(params, states, query)

# Chart totals, one coloured line per state
chart5 = make_chart_totals(df_states, "state", "State")

# Chart proportions, one coloured line per state
chart6 = make_chart_proportions(df_states, "state", "State")

# Shorthand way of concatenating the two charts (note there's only one legend)
chart5 & chart6

# Create a list of dictionaries, each with the 'id' and 'name' of a newspaper
# (the 'id' is sent to the API in 'l-title'; the 'name' is used to label the results)
newspapers = [
    {"id": 1180, "name": "Sydney Sun"},
    {"id": 35, "name": "Sydney Morning Herald"},
    {"id": 1002, "name": "Tribune"},
]

# Our search query we want to compare across newspapers
# (this replaces the value of 'query' used for the state comparison)
query = "worker"

def get_newspaper_totals(newspaper_id):
    """
    Harvest the total number of articles per year for one newspaper.

    Starts from the module-level params, without any search query, and
    limits the results with the title facet.
    Parameters:
        newspaper_id - numeric Trove newspaper identifier
    Returns:
        A list of dictionaries containing 'year', 'total_results'.
    """
    # Copy the basic parameters, leaving out 'q' so no search query is sent
    title_params = {key: value for key, value in params.items() if key != "q"}

    # Limit results to the given newspaper
    title_params["l-title"] = newspaper_id

    # Harvest and return all the year by year data
    return get_facet_data(title_params)


def get_newspaper_facets(params, newspapers, query):
    """
    Run one search query against each newspaper in turn and collect the year-by-year results.
    For each newspaper the search results are merged with that newspaper's total
    number of articles, so proportions can be compared between newspapers.
    Parameters:
        params - basic parameters to send to the API (not modified)
        newspapers - a list of dictionaries with the id and name of a newspaper
        query - the search query to use
    Returns:
        A dataframe combining the merged results of every newspaper, with a 'newspaper' column
    """
    # Copy of the basic parameters with the 'q' parameter set to the supplied query
    search_params = {**params, "q": query}

    frames = []
    for paper in newspapers:
        paper_id = paper["id"]

        # Set the title facet to the id of the current newspaper
        search_params["l-title"] = paper_id

        # Year by year results for this newspaper, as a dataframe
        results = pd.DataFrame(get_facet_data(search_params))

        # Total number of articles published in this newspaper per year, as a dataframe
        totals = pd.DataFrame(get_newspaper_totals(paper_id))

        # Combine the two and work out proportions
        combined = merge_df_with_total(results, totals)

        # Label the rows with the name of the newspaper
        combined["newspaper"] = paper["name"]
        frames.append(combined)

    # Stack the per-newspaper dataframes into one
    return pd.concat(frames)

# Harvest results and totals for the query in each of the listed newspapers
df_newspapers = get_newspaper_facets(params, newspapers, query)

# Chart totals, one coloured line per newspaper
chart7 = make_chart_totals(df_newspapers, "newspaper", "Newspaper")

# Chart proportions, one coloured line per newspaper
chart8 = make_chart_proportions(df_newspapers, "newspaper", "Newspaper")

# Shorthand way of concatenating the two charts (note there's only one legend)
chart7 & chart8

# Illustration types to compare (each is sent to the API in 'l-illustrationType')
ill_types = ["Photo", "Cartoon", "Illustration", "Map", "Graph"]

def get_ill_facets(params, ill_types):
    """
    Harvest year-by-year counts of illustrated articles for each illustration type.
    Parameters:
        params - basic parameters to send to the API (not modified)
        ill_types - a list of illustration types to use with the illustration type facet
    Returns:
        A dataframe with the facet data for every type, plus an 'ill_type' column
    """
    # Copy the basic parameters, then:
    #  - set q to a single space (no real query, we want everything)
    #  - set the illustrated facet to true, which is necessary before setting the illustration type
    # NOTE(review): other helpers in this file remove 'q' rather than sending a blank query -- confirm both give the same counts
    search_params = {**params, "q": " ", "l-illustrated": "true"}

    frames = []
    for ill_type in ill_types:
        # Limit results to the current illustration type
        search_params["l-illustrationType"] = ill_type

        # Harvest the year by year data and load it into a dataframe
        frame = pd.DataFrame(get_facet_data(search_params))

        # Label the rows with the illustration type
        frame["ill_type"] = ill_type
        frames.append(frame)

    # Stack the per-type dataframes into one
    return pd.concat(frames)

# Harvest the year-by-year counts for each illustration type
df_illtypes = get_ill_facets(params, ill_types)

# Merge results with total articles and calculate proportions
df_illtypes_merged = merge_df_with_total(df_illtypes, df_total)

# Make total results chart, one coloured line per illustration type
chart9 = make_chart_totals(df_illtypes_merged, "ill_type", "Type")

# Make proportions chart, one coloured line per illustration type
chart10 = make_chart_proportions(df_illtypes_merged, "ill_type", "Type")

# Shorthand way of concatenating the two charts (note there's only one legend)
chart9 & chart10

# Work on a copy of the basic parameters
ill_params = params.copy()

# No query! Remove 'q' from the parameters if it is present
ill_params.pop("q", None)

# Set the illustrated facet to true - necessary before setting ill_type
ill_params["l-illustrated"] = "true"

# Ask for the illustration type facet instead of the year facet
ill_params["facet"] = "illustrationType"

# A single request is enough here -- there's no looping over decades
data = get_results(ill_params)
facets = []

# Only one facet was requested, so the terms are read from the first facet in the response
for term in data["category"][0]["facets"]["facet"][0]["term"]:
    # Get the illustration type and the number of results (converting the count to an integer), before adding to our results
    facets.append({"ill_type": term["search"], "total_results": int(term["count"])})

# Load the counts per illustration type into a dataframe and display it
df_ill_types = pd.DataFrame(facets)
df_ill_types

# Search for the string "tbe" -- presumably a common OCR misreading of "the"; confirm the query syntax against the Trove docs
# Note that this changes the shared params dictionary, so later harvests using params will include this query
params["q"] = 'text:"tbe"~0'

# Harvest the year-by-year counts for this query
ocr_facets = get_facet_data(params)

# Load into a dataframe, then add total articles per year and the proportion
df_ocr = pd.DataFrame(ocr_facets)
df_ocr_merged = merge_df_with_total(df_ocr, df_total)

# Line chart of the proportion of articles per year that match the query
alt.Chart(df_ocr_merged).mark_line(point=True).encode(
    x=alt.X("year:Q", axis=alt.Axis(format="c", title="Year")),
    # This time we're showing the proportion (formatted as a percentage) on the Y axis
    y=alt.Y("proportion:Q", axis=alt.Axis(format="%", title="Proportion of articles")),
    tooltip=[
        alt.Tooltip("year:Q", title="Year"),
        alt.Tooltip("proportion:Q", title="Proportion", format="%"),
    ],
).properties(width=700, height=400)

	year	total_results
0	1830	1
1	1831	1
2	1832	1
3	1833	2
4	1834	5

	year	total_results	total_articles	proportion
0	1830	1	8977	0.000111
1	1831	1	10989	0.000091
2	1832	1	14814	0.000068
3	1833	2	15622	0.000128
4	1834	5	18704	0.000267

Visualise Trove newspaper searches over time¶

1. Setting things up¶

Import what we need¶

Enter a Trove API key¶

2. Find the number of articles per year using facets¶

3. How many articles in total were published each year?¶

4. Charting our search results as a proportion of total articles¶

5. Comparing multiple search terms over time¶

6. Comparing a search term across different states¶

7. Comparing a search term across different newspapers¶

8. Chart changes in illustration types over time¶

9. But what are we searching?¶

10. Next steps¶

11. Related resources¶

QueryPic¶

Visualise a search in Papers Past¶

Trove Newspaper Harvester¶

12. Further reading¶

	ill_type	total_results
0	Photo	7523021
1	Illustration	2992908
2	Cartoon	1202747
3	Map	334039
4	Cartoons	68216
5	Graph	32651
6	Chart	20
7	Unknown	9
8	Diagram	5

Visualise Trove newspaper searches over time¶

1. Setting things up¶

Import what we need¶

Enter a Trove API key¶

2. Find the number of articles per year using facets¶

3. How many articles in total were published each year?¶

4. Charting our search results as a proportion of total articles¶

5. Comparing multiple search terms over time¶

6. Comparing a search term across different states¶

7. Comparing a search term across different newspapers¶

8. Chart changes in illustration types over time¶

9. But what are we searching?¶

10. Next steps¶

11. Related resources¶

QueryPic¶

Visualise a search in Papers Past¶

Trove Newspaper Harvester¶

12. Further reading¶