The use of standard licences and rights statements in Trove image records¶
Version 2.1 of the Trove API introduced a new rights
index that you can use to limit your search results to records that include one of the licences and rights statements listed on this page. We can also use this index to build a picture of which rights statements are currently being used, and by whom. Let's give it a try...
The method used here is to:
- Retrieve details of Trove contributors from the API
- Loop through the contributors, then loop through all the licences/rights statements, firing off a search in the
picture
zone for each combination. - Save the total results for each query with the contributor and licence details.
So for every organisation that contributes records to Trove, we'll find out the number of image records that include each rights statement.
Problems:
Searching by contributor saves us having to harvest all the images, but it has a major problem. Sometimes Trove will group multiple versions of a picture held by different organisations as a single work. Rights information is saved in the version metadata, but searches only return works. So if one organisation has assigned a rights statement to a version of the image, it will look like all the organisations whose images are grouped together with it as a work are using that rights statement. I don't think this will make a huge difference to the results, but it will be something to look out for. The only way around this is to harvest everything and expand the versions out into separate records.
The
rights
index doesn't currently seem to include information on out of copyright images, unless they've actually been marked using the 'Public Domain' statement by the institution. Common statements such as 'Out of copyright', 'No known copyright restrictions', or 'Copyright expired' return no results. So there are many more open images than are currently reported by the rights index.
import os
import pandas as pd
import requests_cache
from dotenv import load_dotenv
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.notebook import tqdm
# Cached session so repeated queries don't hit the API twice,
# with automatic retries on transient server errors
s = requests_cache.CachedSession()
retry_strategy = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
for scheme in ("http://", "https://"):
    s.mount(scheme, HTTPAdapter(max_retries=retry_strategy))
load_dotenv()
True
# These are all the licence/rights statements recognised by Trove's `rights` index.
# Copied from https://help.nla.gov.au/trove/becoming-partner/for-content-partners/licensing-reuse
# Each value has the form "<category>/<statement>" where category is one of
# "Free", "Free with conditions", or "Restricted".
licences = [
    # Free
    "Free/CC Public Domain",
    "Free/CC BY",
    "Free/CC0",
    "Free/RS NKC",
    "Free/RS Noc-US",
    # Free with conditions
    "Free with conditions/CC BY-ND",
    "Free with conditions/CC BY-SA",
    "Free with conditions/CC BY-NC",
    "Free with conditions/CC BY-NC-ND",
    "Free with conditions/CC BY-NC-SA",
    "Free with conditions/RS NoC-NC",
    "Free with conditions/InC-NC",
    "Free with conditions/InC-EDU",
    # Restricted
    "Restricted/RS InC",
    "Restricted/RS InC-OW-EU",
    "Restricted/RS InC-RUU",
    "Restricted/RS CNE",
    "Restricted/RS UND",
    "Restricted/NoC-CR",
    "Restricted/NoC-OKLR",
]
# Trove API key: use the TROVE_API_KEY environment variable when it is set
# (and non-empty), otherwise fall back to the placeholder -- insert your own
# key there if you're not using environment variables / a .env file.
API_KEY = os.getenv("TROVE_API_KEY") or "YOUR API KEY"

# All Trove API v3 requests are authenticated via this header
HEADERS = {"X-API-KEY": API_KEY}
def save_summary(contributors, record, parent=None):
    """
    Flatten one contributor record (and all its descendants) into `contributors`.

    Builds a summary dict holding the record's id and name, records the
    parent's id where one exists, and composes a full_name by joining the
    parent's full name (or plain name) with this record's name.
    """
    summary = {"id": record["id"], "name": record["name"]}
    if parent is not None:
        # Recursive call: we already built the parent's summary
        summary["parent_id"] = parent["id"]
        summary["full_name"] = f'{parent["full_name"]} / {record["name"]}'
    elif "parent" in record:
        # Top-level record that carries its own parent details
        summary["parent_id"] = record["parent"]["id"]
        summary["full_name"] = f'{record["parent"]["name"]} / {record["name"]}'
    else:
        summary["full_name"] = record["name"]
    # Traverse nested child agencies before saving this record itself
    for child in record.get("children", []):
        save_summary(contributors, child, summary)
    contributors.append(summary)
def get_contributors():
    """
    Fetch the full list of Trove contributors from the API.

    Requests full contributor records, then flattens the nested
    parent/child hierarchy into a single list of summary dicts.
    """
    contributors = []
    response = s.get(
        "https://api.trove.nla.gov.au/v3/contributor/",
        params={"key": API_KEY, "encoding": "json", "reclevel": "full"},
        headers=HEADERS,
        timeout=60,
    )
    for record in response.json()["contributor"]:
        save_summary(contributors, record)
    return contributors
def contributor_has_results(contrib, params, additional_query):
    """
    Check whether a search limited to this contributor returns any results.

    Parameters:
        contrib: contributor summary dict; its "id" (NUC identifier) is used
            to build a `nuc:` query.
        params: base API parameters. NOTE: this dict is mutated -- its "q"
            value is overwritten with the constructed query.
        additional_query: extra query string appended to the search, or None.

    Returns:
        bool: True if the search reports one or more matching records.
    """
    query = f'nuc:"{contrib["id"]}"'
    # Add any extra queries
    if additional_query:
        query += f" {additional_query}"
    params["q"] = query
    response = s.get(
        "https://api.trove.nla.gov.au/v3/result",
        params=params,
        headers=HEADERS,
        timeout=60,
    )
    data = response.json()
    total = int(data["category"][0]["records"]["total"])
    # Return an explicit bool -- the original fell through and returned
    # None (rather than False) when there were no results.
    return total > 0
def licence_counts_by_institution(additional_query=None):
    """
    Harvest, per contributor, the number of image records for each licence.

    Loops through every Trove contributor and, for each licence/rights
    statement in `licences`, fires a search in the "image" category limited
    to that contributor's NUC id. Returns a list of dicts (one per
    contributor with any results at all) mapping licence names to totals.
    """
    counts = []
    base_params = {"encoding": "json", "category": "image", "n": 0}
    for contrib in tqdm(get_contributors()):
        # Skip contributors with no results at all -- saves one API call
        # per licence for empty contributors
        if not contributor_has_results(contrib, base_params, additional_query):
            continue
        row = contrib.copy()
        # Only search for nuc ids that start with a letter
        if contrib["id"][0].isalpha():
            for licence in licences:
                # Query combines the nuc id, the licence, and any extras
                query = f'nuc:"{contrib["id"]}" rights:"{licence}"'
                if additional_query:
                    query += f" {additional_query}"
                base_params["q"] = query
                response = s.get(
                    "https://api.trove.nla.gov.au/v3/result",
                    params=base_params,
                    headers=HEADERS,
                    timeout=60,
                )
                data = response.json()
                row[licence] = int(data["category"][0]["records"]["total"])
        counts.append(row)
    return counts
licence_counts_not_books = licence_counts_by_institution('NOT format:"Book"')
Process the data¶
# One row per contributor, one column per licence statement
df = pd.DataFrame(licence_counts_not_books)
# Fill empty totals with zeros & make them all integers
# (contributors that weren't queried for a licence have no value at all)
df[licences] = df[licences].fillna(0).astype(int)
# Check the overall distribution of rights statements
df.sum(numeric_only=True)
Free/CC Public Domain 308691 Free/CC BY 171779 Free/CC0 2130 Free/RS NKC 5892 Free/RS Noc-US 0 Free with conditions/CC BY-ND 0 Free with conditions/CC BY-SA 13045 Free with conditions/CC BY-NC 23991 Free with conditions/CC BY-NC-ND 25022 Free with conditions/CC BY-NC-SA 125873 Free with conditions/RS NoC-NC 0 Free with conditions/InC-NC 0 Free with conditions/InC-EDU 4639 Restricted/RS InC 14613 Restricted/RS InC-OW-EU 0 Restricted/RS InC-RUU 1 Restricted/RS CNE 12868 Restricted/RS UND 415 Restricted/NoC-CR 0 Restricted/NoC-OKLR 0 dtype: int64
# Keep only the id, full name, and per-licence count columns
df_final = df[["id", "full_name"] + licences]
# Remove rows that add up to zero (contributors using no rights statements)
df_final = df_final.loc[(df_final.sum(axis=1, numeric_only=True) != 0)]
# Remove columns that are all zero (statements no contributor uses)
df_final = df_final.loc[:, df_final.any()]
# Sort by name and save as CSV
df_final.sort_values(by=["full_name"]).to_csv("rights-on-images.csv", index=False)
Are there any licences applied to out-of-copyright images?¶
Some GLAM institutions apply restrictive licences to digitised versions of out-of-copyright images. Under Australian copyright law, photographs created before 1955 are out of copyright, so we can adjust our query and look to see what sorts of rights statements are attached to them.
# Limit the harvest to photographs dated up to 1954 -- under Australian
# copyright law, photos created before 1955 are out of copyright
licence_counts_out_of_copyright = licence_counts_by_institution(
    "format:Photograph date:[* TO 1954]"
)
df2 = pd.DataFrame(licence_counts_out_of_copyright)
# Fill empty totals with zeros & make them all integers
df2[licences] = df2[licences].fillna(0).astype(int)
# Check the overall distribution of rights statements
df2.sum(numeric_only=True)
Free/CC Public Domain 30088 Free/CC BY 15017 Free/CC0 653 Free/RS NKC 1537 Free/RS Noc-US 0 Free with conditions/CC BY-ND 0 Free with conditions/CC BY-SA 934 Free with conditions/CC BY-NC 84 Free with conditions/CC BY-NC-ND 829 Free with conditions/CC BY-NC-SA 1412 Free with conditions/RS NoC-NC 0 Free with conditions/InC-NC 0 Free with conditions/InC-EDU 2 Restricted/RS InC 128 Restricted/RS InC-OW-EU 0 Restricted/RS InC-RUU 0 Restricted/RS CNE 572 Restricted/RS UND 2 Restricted/NoC-CR 0 Restricted/NoC-OKLR 0 dtype: int64
# Keep only the id, full name, and per-licence count columns
df2_final = df2[["id", "full_name"] + licences]
# Remove rows that add up to zero (contributors using no rights statements)
df2_final = df2_final.loc[(df2_final.sum(axis=1, numeric_only=True) != 0)]
# Remove columns that are all zero (statements no contributor uses)
df2_final = df2_final.loc[:, df2_final.any()]
# Sort by name and save as CSV
df2_final.sort_values(by=["full_name"]).to_csv(
    "rights-on-out-of-copyright-photos.csv", index=False
)
# IGNORE THIS CELL -- FOR TESTING ONLY
if os.getenv("GW_STATUS") == "dev":
    # Keep a reference to the real harvester: after the rebinding below,
    # a by-name call to `get_contributors` inside the sample function
    # would recurse into itself, so it must call the saved original.
    # (The original cell avoided this by duplicating the whole function
    # body; delegating to the saved reference removes that duplication.)
    _full_get_contributors = get_contributors

    def get_contributors_sample():
        """
        Get a sample of contributors from the Trove API for testing.
        Fetches and flattens the full list, then keeps the first ten.
        """
        return _full_get_contributors()[:10]

    # Monkey-patch so licence_counts_by_institution only sees the sample
    get_contributors = get_contributors_sample
    licence_counts_not_books = licence_counts_by_institution('NOT format:"Book"')
    df = pd.DataFrame(licence_counts_not_books)
    licence_counts_out_of_copyright = licence_counts_by_institution(
        "format:Photograph date:[* TO 1954]"
    )
    df2 = pd.DataFrame(licence_counts_out_of_copyright)
Created by Tim Sherratt for the GLAM Workbench.
Support this project by becoming a GitHub sponsor.