The use of standard licences and rights statements in Trove image records¶
Version 2.1 of the Trove API introduced a new rights
index that you can use to limit your search results to records that include one of the licences and rights statements listed on this page. We can also use this index to build a picture of which rights statements are currently being used, and by whom. Let's give it a try...
The method used here is to:
- Retrieve details of Trove contributors from the API
- Loop through the contributors, then loop through all the licences/rights statements, firing off a search in the
picture
zone for each combination. - Save the total results for each query with the contributor and licence details.
So for every organisation that contributes records to Trove, we'll find out the number of image records that include each rights statement.
Problems:
Searching by contributor saves us having to harvest all the images, but it has a major problem. Sometimes Trove will group multiple versions of a picture held by different organisations as a single work. Rights information is saved in the version metadata, but searches only return works. So if one organisation has assigned a rights statement to a version of the image, it will look like all the organisations whose images are grouped together with it as a work are using that rights statement. I don't think this will make a huge difference to the results, but it will be something to look out for. The only way around this is to harvest everything and expand the versions out into separate records.
The
rights
index doesn't currently seem to include information on out of copyright images, unless they've actually been marked using the 'Public Domain' statement by the institution. Common statements such as 'Out of copyright', 'No known copyright restrictions', or 'Copyright expired' return no results. So there are many more open images than are currently reported by the rights index.
import os
import pandas as pd
import requests_cache
from dotenv import load_dotenv
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.notebook import tqdm
# Cached session so repeated queries don't hit the API twice,
# with automatic retries on transient server errors
s = requests_cache.CachedSession()
retry_strategy = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
for scheme in ("http://", "https://"):
    s.mount(scheme, HTTPAdapter(max_retries=retry_strategy))
load_dotenv()
True
# These are all the licence/rights statements recognised by Trove's `rights` index.
# Copied from https://help.nla.gov.au/trove/becoming-partner/for-content-partners/licensing-reuse
# Each value has the form "<category>/<statement>" where category is one of
# "Free", "Free with conditions", or "Restricted".
licences = [
    # Free
    "Free/CC Public Domain",
    "Free/CC BY",
    "Free/CC0",
    "Free/RS NKC",
    "Free/RS Noc-US",
    # Free with conditions
    "Free with conditions/CC BY-ND",
    "Free with conditions/CC BY-SA",
    "Free with conditions/CC BY-NC",
    "Free with conditions/CC BY-NC-ND",
    "Free with conditions/CC BY-NC-SA",
    "Free with conditions/RS NoC-NC",
    "Free with conditions/InC-NC",
    "Free with conditions/InC-EDU",
    # Restricted
    "Restricted/RS InC",
    "Restricted/RS InC-OW-EU",
    "Restricted/RS InC-RUU",
    "Restricted/RS CNE",
    "Restricted/RS UND",
    "Restricted/NoC-CR",
    "Restricted/NoC-OKLR",
]
# Trove API key: use the TROVE_API_KEY environment variable when it is set
# (and non-empty), otherwise fall back to the placeholder -- insert your own
# key there if you're not using environment variables / a .env file.
API_KEY = os.getenv("TROVE_API_KEY") or "YOUR API KEY"

# All Trove API v3 requests are authenticated via this header
HEADERS = {"X-API-KEY": API_KEY}
def save_summary(contributors, record, parent=None):
    """
    Flatten one contributor record (and all its descendants) into `contributors`.

    Builds a summary dict holding the record's id and name, records the
    parent's id where one exists, and composes a full_name by joining the
    parent's full name (or plain name) with this record's name.
    """
    summary = {"id": record["id"], "name": record["name"]}
    if parent is not None:
        # Recursive call: we already built the parent's summary
        summary["parent_id"] = parent["id"]
        summary["full_name"] = f'{parent["full_name"]} / {record["name"]}'
    elif "parent" in record:
        # Top-level record that carries its own parent details
        summary["parent_id"] = record["parent"]["id"]
        summary["full_name"] = f'{record["parent"]["name"]} / {record["name"]}'
    else:
        summary["full_name"] = record["name"]
    # Traverse nested child agencies before saving this record itself
    for child in record.get("children", []):
        save_summary(contributors, child, summary)
    contributors.append(summary)
def get_contributors():
    """
    Fetch the full list of Trove contributors from the API.

    Requests full contributor records, then flattens the nested
    parent/child hierarchy into a single list of summary dicts.
    """
    contributors = []
    response = s.get(
        "https://api.trove.nla.gov.au/v3/contributor/",
        params={"key": API_KEY, "encoding": "json", "reclevel": "full"},
        headers=HEADERS,
        timeout=60,
    )
    for record in response.json()["contributor"]:
        save_summary(contributors, record)
    return contributors
def contributor_has_results(contrib, params, additional_query):
    """
    Check whether a search limited to this contributor returns any results.

    Parameters:
        contrib: contributor summary dict; its "id" (NUC identifier) is used
            to build a `nuc:` query.
        params: base API parameters. NOTE: this dict is mutated -- its "q"
            value is overwritten with the constructed query.
        additional_query: extra query string appended to the search, or None.

    Returns:
        bool: True if the search reports one or more matching records.
    """
    query = f'nuc:"{contrib["id"]}"'
    # Add any extra queries
    if additional_query:
        query += f" {additional_query}"
    params["q"] = query
    response = s.get(
        "https://api.trove.nla.gov.au/v3/result",
        params=params,
        headers=HEADERS,
        timeout=60,
    )
    data = response.json()
    total = int(data["category"][0]["records"]["total"])
    # Return an explicit bool -- the original fell through and returned
    # None (rather than False) when there were no results.
    return total > 0
def licence_counts_by_institution(additional_query=None):
    """
    Harvest, per contributor, the number of image records for each licence.

    Loops through every Trove contributor and, for each licence/rights
    statement in `licences`, fires a search in the "image" category limited
    to that contributor's NUC id. Returns a list of dicts (one per
    contributor with any results at all) mapping licence names to totals.
    """
    counts = []
    base_params = {"encoding": "json", "category": "image", "n": 0}
    for contrib in tqdm(get_contributors()):
        # Skip contributors with no results at all -- saves one API call
        # per licence for empty contributors
        if not contributor_has_results(contrib, base_params, additional_query):
            continue
        row = contrib.copy()
        # Only search for nuc ids that start with a letter
        if contrib["id"][0].isalpha():
            for licence in licences:
                # Query combines the nuc id, the licence, and any extras
                query = f'nuc:"{contrib["id"]}" rights:"{licence}"'
                if additional_query:
                    query += f" {additional_query}"
                base_params["q"] = query
                response = s.get(
                    "https://api.trove.nla.gov.au/v3/result",
                    params=base_params,
                    headers=HEADERS,
                    timeout=60,
                )
                data = response.json()
                row[licence] = int(data["category"][0]["records"]["total"])
        counts.append(row)
    return counts
licence_counts_not_books = licence_counts_by_institution('NOT format:"Book"')
Process the data¶
# One row per contributor, one column per licence statement
df = pd.DataFrame(licence_counts_not_books)
# Fill empty totals with zeros & make them all integers
# (contributors that weren't queried for a licence have no value at all)
df[licences] = df[licences].fillna(0).astype(int)
# Check the overall distribution of rights statements
df.sum(numeric_only=True)
Free/CC Public Domain 308691 Free/CC BY 171779 Free/CC0 2130 Free/RS NKC 5892 Free/RS Noc-US 0 Free with conditions/CC BY-ND 0 Free with conditions/CC BY-SA 13045 Free with conditions/CC BY-NC 23991 Free with conditions/CC BY-NC-ND 25022 Free with conditions/CC BY-NC-SA 125873 Free with conditions/RS NoC-NC 0 Free with conditions/InC-NC 0 Free with conditions/InC-EDU 4639 Restricted/RS InC 14613 Restricted/RS InC-OW-EU 0 Restricted/RS InC-RUU 1 Restricted/RS CNE 12868 Restricted/RS UND 415 Restricted/NoC-CR 0 Restricted/NoC-OKLR 0 dtype: int64
# Keep only the id, full name, and per-licence count columns
df_final = df[["id", "full_name"] + licences]
# Remove rows that add up to zero (contributors using no rights statements)
df_final = df_final.loc[(df_final.sum(axis=1, numeric_only=True) != 0)]
# Remove columns that are all zero (statements no contributor uses)
df_final = df_final.loc[:, df_final.any()]
# Sort by name and save as CSV
df_final.sort_values(by=["full_name"]).to_csv("rights-on-images.csv", index=False)
Are there any licences applied to out-of-copyright images?¶
Some GLAM institutions apply restrictive licences to digitised versions of out-of-copyright images. Under Australian copyright law, photographs created before 1955 are out of copyright, so we can adjust our query and look to see what sorts of rights statements are attached to them.
# Limit the harvest to photographs dated up to 1954 -- under Australian
# copyright law, photos created before 1955 are out of copyright
licence_counts_out_of_copyright = licence_counts_by_institution(
    "format:Photograph date:[* TO 1954]"
)
df2 = pd.DataFrame(licence_counts_out_of_copyright)
# Fill empty totals with zeros & make them all integers
df2[licences] = df2[licences].fillna(0).astype(int)
# Check the overall distribution of rights statements
df2.sum(numeric_only=True)
Free/CC Public Domain 30088 Free/CC BY 15017 Free/CC0 653 Free/RS NKC 1537 Free/RS Noc-US 0 Free with conditions/CC BY-ND 0 Free with conditions/CC BY-SA 934 Free with conditions/CC BY-NC 84 Free with conditions/CC BY-NC-ND 829 Free with conditions/CC BY-NC-SA 1412 Free with conditions/RS NoC-NC 0 Free with conditions/InC-NC 0 Free with conditions/InC-EDU 2 Restricted/RS InC 128 Restricted/RS InC-OW-EU 0 Restricted/RS InC-RUU 0 Restricted/RS CNE 572 Restricted/RS UND 2 Restricted/NoC-CR 0 Restricted/NoC-OKLR 0 dtype: int64
# Keep only the id, full name, and per-licence count columns
df2_final = df2[["id", "full_name"] + licences]
# Remove rows that add up to zero (contributors using no rights statements)
df2_final = df2_final.loc[(df2_final.sum(axis=1, numeric_only=True) != 0)]
# Remove columns that are all zero (statements no contributor uses)
df2_final = df2_final.loc[:, df2_final.any()]
# Sort by name and save as CSV
df2_final.sort_values(by=["full_name"]).to_csv(
    "rights-on-out-of-copyright-photos.csv", index=False
)
# IGNORE THIS CELL -- FOR TESTING ONLY
if os.getenv("GW_STATUS") == "dev":
    # Keep a reference to the real harvester: after the rebinding below,
    # a by-name call to `get_contributors` inside the sample function
    # would recurse into itself, so it must call the saved original.
    # (The original cell avoided this by duplicating the whole function
    # body; delegating to the saved reference removes that duplication.)
    _full_get_contributors = get_contributors

    def get_contributors_sample():
        """
        Get a sample of contributors from the Trove API for testing.
        Fetches and flattens the full list, then keeps the first ten.
        """
        return _full_get_contributors()[:10]

    # Monkey-patch so licence_counts_by_institution only sees the sample
    get_contributors = get_contributors_sample
    licence_counts_not_books = licence_counts_by_institution('NOT format:"Book"')
    df = pd.DataFrame(licence_counts_not_books)
    licence_counts_out_of_copyright = licence_counts_by_institution(
        "format:Photograph date:[* TO 1954]"
    )
    df2 = pd.DataFrame(licence_counts_out_of_copyright)
Created by Tim Sherratt for the GLAM Workbench.
Support this project by becoming a GitHub sponsor.