Download a collection of digitised images¶
Digitised photographs and other images are often organised into collections. While the Trove web interface does include a download option for collections, it has a number of limitations:
- the images are all combined into a single zip file
- you can generally download a maximum of 20 images at a time
- the resolution of the downloaded images is often quite low
This notebook provides an alternative method that downloads all of the available images in a collection (and any sub-collections) at the highest available resolution. The method is as follows:
- the nla.obj identifiers for all the items in the collection are harvested from the browse interface
- a url to download a high-resolution version of the image is constructed using each nla.obj id
- each image is downloaded and saved
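For example, the download url for a single image is built by adding /image to the item's nla.obj identifier. This is the same pattern used by the download_image() function below; the identifier here is the example poster mentioned later in this notebook.
item_id = "nla.obj-147116797"
# High-resolution image download url for this item
image_url = f"https://nla.gov.au/{item_id}/image"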
The downloaded images will be saved in the images/[COLLECTION ID] folder. Once the harvest is complete, the dataset will be zipped up with an RO-Crate metadata file and a link displayed for easy download. The RO-Crate metadata file captures the context and results of the harvest.
The image file names use the nla.obj identifiers. For example, the image of nla.obj-147116797 is saved as nla.obj-147116797.jpg. The identifiers also link the image back to the website: nla.obj-147116797.jpg comes from https://nla.gov.au/nla.obj-147116797.
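As a rough sketch of the resulting layout (mirroring the path handling in the downloading functions below), the save path for the example image above is built like this:
from pathlib import Path

# '[COLLECTION ID]' is a placeholder for the collection's nla.obj identifier
collection_id = "[COLLECTION ID]"
item_id = "nla.obj-147116797"
# Images end up in an 'images' subfolder of images/[COLLECTION ID],
# named using the item's nla.obj identifier
file_path = Path("images", collection_id, "images", f"{item_id}.jpg")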
Finding collections of images¶
There's no direct way of searching for collections; they tend to be mixed up in search results with individual images. Not all digitised images are in collections, but if they are you can use the breadcrumbs navigation to move up the hierarchy. Each level in the collection hierarchy will have its own nla.obj identifier that you can use to download images from that level and below.
For example, this excellent poster is part of a very large collection of digitised posters and sits at the bottom of the breadcrumb hierarchy: Home > Guide to Pre-1950 Advertising Posters in the National Library of Australia digitised by the 2019 Tax Time Appeal > Poster drawers > Posters. Clicking on 'Guide to Pre-1950 Advertising Posters', 'Poster drawers', or 'Posters' will take you to different levels in the collection hierarchy. You can then just copy the nla.obj identifier from the url and paste it below to download all child images.
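If you've copied a full url, one simple way to pull out the nla.obj identifier is a regular expression (a minimal sketch; the identifier format matches the examples above):
import re

url = "https://nla.gov.au/nla.obj-147116797"
# nla.obj identifiers are 'nla.obj-' followed by a string of digits
match = re.search(r"nla\.obj-\d+", url)
collection_id = match.group(0) if match else None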
import json
import mimetypes
import os
import re
import time
from datetime import datetime, timedelta
from pathlib import Path
import ipynbname
import nbformat
import requests_cache
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from IPython.display import HTML, display
from requests.adapters import HTTPAdapter
from requests.exceptions import ConnectionError
from requests.packages.urllib3.util.retry import Retry
from rocrate.rocrate import ContextEntity, ROCrate
from tqdm.auto import tqdm
s = requests_cache.CachedSession(expire_after=timedelta(days=30))
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))
load_dotenv()
def prepare_url(url):
"""
Make sure nla.obj identifiers are properly formatted urls.
"""
url = re.sub(r"https?://nla/", "https://nla.gov.au/", url)
url = url.replace("\\\\", "//")
if not url.startswith("http"):
# print(url)
url = f"https://nla.gov.au/{url.strip('/')}"
return url
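# A couple of quick checks of how prepare_url() normalises identifiers --
# a bare identifier gains the https://nla.gov.au/ prefix, and a malformed
# 'http://nla/' url is repaired.
print(prepare_url("nla.obj-147116797"))  # https://nla.gov.au/nla.obj-147116797
print(prepare_url("http://nla/nla.obj-147116797"))  # https://nla.gov.au/nla.obj-147116797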
def get_work_data(url):
"""
Extract work data in a JSON string from the work's HTML page.
"""
url = prepare_url(url)
try:
response = s.get(url)
    except ConnectionError:
        # If the request fails completely, report the url and return an empty result
        print(url)
        return {}
if response.ok:
try:
work_data = re.search(
r"var work = JSON\.parse\(JSON\.stringify\((\{.*\})", response.text
).group(1)
except AttributeError:
work_data = "{}"
else:
print(url)
work_data = "{}"
if not response.from_cache:
time.sleep(0.2)
return json.loads(work_data)
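# Usage sketch: fetch the embedded metadata for an item and check whether it
# lists downloadable copies -- this is how download_images() below decides if a
# sub-collection also has an image of its own.
metadata = get_work_data("nla.obj-147116797")
print("copies" in metadata)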
def harvest_collection_items(collection_id, include_subcollections=False):
"""
Harvest all the items in a Trove collection (including any sub-collections)
by scraping the item identifiers from the 'Browse collection' pop-up.
    See the Trove Data Guide for more details.
"""
# The initial startIdx value
start = 0
# Number of results per page, used to increment the startIdx value
n = 20
items = []
# If there aren't 20 results on the page then we've reached the end, so continue harvesting until that happens.
while n == 20:
url = f"https://nla.gov.au/{collection_id}/browse?startIdx={start}&rows=20&op=c"
# Get the browse page
response = s.get(url)
# Beautifulsoup turns the HTML into an easily navigable structure
soup = BeautifulSoup(response.text, "html.parser")
# Find all the divs containing issue details and loop through them
details = soup.find_all(class_="l-item-info")
for detail in details:
# Set a default type
item_type = "item"
# Look for the a tag with class "obj-reference content"
item_id = detail.find(
lambda tag: tag.name == "a"
and tag.get("class") == ["obj-reference", "content"]
)["href"].strip("/")
# Look for a link to 'children', indicating it's a subcollection (or a book or issue with pages)
has_children = detail.find(
lambda tag: tag.name == "a" and tag.get("class") == ["obj-reference"]
)
# If it has children, harvest items from the subcollection
if has_children and include_subcollections is True:
item_type = "collection"
items += harvest_collection_items(item_id, include_subcollections=True)
# Save the item
# The parent_id will enable us to identify items that are in subcollections
items.append(
{"item_id": item_id, "item_type": item_type, "parent_id": collection_id}
)
time.sleep(0.2)
# Increment the startIdx
start += n
# Set n to the number of results on the current page
n = len(details)
return items
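# Usage sketch: harvest all the item identifiers from the example collection used
# at the end of this notebook. Each result is a dict like
# {"item_id": "nla.obj-...", "item_type": "item", "parent_id": "nla.obj-2590820305"}.
items = harvest_collection_items("nla.obj-2590820305", include_subcollections=True)
print(len(items))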
def create_rocrate(collection_id, dir_path, start_date, end_date):
"""
Create an RO-Crate metadata file describing the downloaded dataset.
"""
crate = ROCrate()
    # Add the directory of downloaded images to the crate
    crate.add_tree(dir_path)
    # Describe the current notebook using the rocrate section of its metadata
    nb_path = ipynbname.path()
nb = nbformat.read(nb_path, nbformat.NO_CONVERT)
metadata = nb.metadata.rocrate
nb_url = metadata.get("url", "")
nb_properties = {
"@type": ["File", "SoftwareSourceCode"],
"name": metadata.get("name", ""),
"description": metadata.get("description", ""),
"encodingFormat": "application/x-ipynb+json",
"codeRepository": metadata.get("codeRepository", ""),
"url": nb_url,
}
crate.add(ContextEntity(crate, nb_url, properties=nb_properties))
    # Describe this run of the notebook as a CreateAction that produced the dataset
    action_id = f"{nb_path.stem}_run"
action_properties = {
"@type": "CreateAction",
"instrument": {"@id": nb_url},
"actionStatus": {"@id": "http://schema.org/CompletedActionStatus"},
"name": f"Run of notebook: {nb_path.name}",
"result": {"@id": f"{dir_path.name}/"},
"query": collection_id,
"startDate": start_date,
"endDate": end_date,
}
crate.add(ContextEntity(crate, action_id, properties=action_properties))
    # Add file-level details (size, date modified, format) for each downloaded image
    for img in dir_path.glob("*.jpg"):
encoding = mimetypes.guess_type(img)[0]
stats = img.stat()
size = stats.st_size
date = datetime.fromtimestamp(stats.st_mtime).strftime("%Y-%m-%d")
crate.update_jsonld(
{
"@id": f"images/{img.name}",
"dateModified": date,
"contentSize": size,
"encodingFormat": encoding,
}
)
    # Save the RO-Crate metadata file and zip up the complete dataset
    crate.write(dir_path.parent)
    crate.write_zip(dir_path.parent)
def download_image(item_id, dir_path, not_available):
    """
    Download the image for an item at the highest available resolution,
    saving it with a file name based on its nla.obj identifier.
    """
file_path = Path(dir_path, f"{item_id}.jpg")
if not file_path.exists():
url = f"https://nla.gov.au/{item_id}/image"
response = s.get(url, stream=True)
# Exclude 404 responses and 'not available' images
if response.ok and response.content != not_available:
file_path.write_bytes(response.content)
time.sleep(1)
def download_images(collection_id, create_crate=True):
    """
    Download all the available images in a collection (and any sub-collections),
    optionally packaging the results with an RO-Crate metadata file.
    """
start_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# Set up a directory to save the images to
dir_path = Path("images", collection_id, "images")
dir_path.mkdir(exist_ok=True, parents=True)
# Load a 'not available' image to compare with what we download
# If the bytes match then we won't save it
not_available = Path("not_available.jpg").read_bytes()
# Get the image identifiers
items = harvest_collection_items(collection_id, include_subcollections=True)
for item in tqdm(items):
item_id = item["item_id"]
if item["item_type"] == "item":
download_image(item_id, dir_path, not_available)
if item["item_type"] == "collection":
# Sometimes items with children also have images that aren't included amongst the children!!
# We need to look at the embedded metadata to check for copies
metadata = get_work_data(item_id)
if "copies" in metadata:
download_image(item_id, dir_path, not_available)
end_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
if create_crate is True:
create_rocrate(collection_id, dir_path, start_date, end_date)
display(
HTML(
f"Download dataset: <a href='images/{collection_id}.zip', download>images/{collection_id}.zip</a>"
)
)
download_images("nla.obj-2590820305")
# IGNORE THIS CELL -- TESTING ONLY
if os.getenv("GW_STATUS") == "dev":
# ipynbname won't work in testing env, so don't create the crate
download_images("nla.obj-2590820305", create_crate=False)
Created by Tim Sherratt for the GLAM Workbench.