Create a database to search across each line of text in a series of volumes¶
The code here was used to create the NSW Post Office Directories search interface, which helps you search across 54 volumes from 1886 to 1950. The same code, with minor modifications, could be used to index any publication where it would be useful to search by line (rather than Trove's default 'article') – for example, lists, directories, and gazetteers – turning them into searchable databases.
All you really need to get started is the top level identifier – that is, the Trove identifier for the publication series, under which individual volumes are listed. Where noted in the code, you'll also want to change a few names and details.
Because the harvesting process can be quite slow, and the amount of data processed quite large, you would probably want to run this notebook within a persistent environment rather than on Binder – either on your own computer, or in a cloud service like Reclaim Cloud or Nectar.
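For example, the cells below harvest the NSW Post Office Directories using the series identifier nla.obj-522689844. If you're adapting this notebook for another publication, these are the sorts of values you'll need to change. The sketch below just gathers them in one place for reference – the cells that follow hard-code them, so treat the variable names here as placeholders.
# Placeholder settings – the cells below hard-code these values,
# so this is just a reminder of what to change for another publication.

# Trove identifier for the top-level collection (the page listing the individual volumes)
series_id = "nla.obj-522689844"

# Directory where the harvested text files will be saved
output_dir = "directories"

# Name of the SQLite database (and the database key used in metadata.json)
db_name = "post-office-directories"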
# Let's import the libraries we need.
import json
import re
import time
from pathlib import Path
import requests
from bs4 import BeautifulSoup
from natsort import natsorted, ns
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from slugify import slugify
from sqlite_utils import Database
from tqdm.auto import tqdm
s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))
Get the data!¶
The first step is to download all of the OCRd text for each volume in the series. If they're large volumes this could take a long time.
def harvest_metadata(obj_id):
"""
This calls an internal API from a journal landing page to extract a list of available issues.
"""
# The initial startIdx value
start = 0
# Number of results per page
n = 20
issues = []
with tqdm(desc="Issues", leave=False) as pbar:
        # Keep harvesting until a page returns fewer than 20 results – that means we've reached the end.
while n == 20:
# Get the browse page
response = s.get(
f"https://nla.gov.au/{obj_id}/browse?startIdx={start}&rows=20&op=c",
timeout=60,
)
# Beautifulsoup turns the HTML into an easily navigable structure
soup = BeautifulSoup(response.text, "lxml")
# Find all the divs containing issue details and loop through them
details = soup.find_all(class_="l-item-info")
for detail in details:
issue = {}
title = detail.find("h3")
if title:
issue["title"] = title.text
issue["id"] = title.parent["href"].strip("/")
else:
issue["title"] = "No title"
issue["id"] = detail.find("a")["href"].strip("/")
try:
# Get the issue details
issue["details"] = detail.find(
class_="obj-reference content"
).string.strip()
except (AttributeError, IndexError):
issue["details"] = "issue"
# Get the number of pages
try:
issue["pages"] = int(
re.search(
r"^(\d+)",
detail.find("a", attrs={"data-pid": issue["id"]}).text,
flags=re.MULTILINE,
).group(1)
)
except AttributeError:
issue["pages"] = 0
issues.append(issue)
# print(issue)
time.sleep(0.2)
# Increment the startIdx
start += n
# Set n to the number of results on the current page
n = len(details)
pbar.update(n)
return issues
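Each item in the list returned by harvest_metadata() is a dictionary recording the issue's title, identifier, details and page count. A made-up example of the structure (the values are purely illustrative):
# Hypothetical example of a single issue record (values invented for illustration)
{
    "title": "The New South Wales Post Office directory",
    "id": "nla.obj-000000000",
    "details": "1886",
    "pages": 1000,
}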
def save_ocr(issues, obj_id, title=None, output_dir="directories"):
"""
Download the OCRd text for each page in each issue.
"""
if not title:
title = issues[0]["title"]
for issue in tqdm(issues, desc="Texts", leave=False):
        # Skip issues that have no pages
if issue["pages"] != 0:
issue_name = f'{slugify(issue["title"])[:50]}-{slugify(issue["details"])}-{issue["id"]}'
issue_path = Path(output_dir, issue_name)
issue_path.mkdir(exist_ok=True, parents=True)
for page in tqdm(range(0, issue["pages"])):
file_path = Path(issue_path, f"{issue_name}-page{page}.txt")
# Check to see if the file has already been harvested
if not file_path.exists():
url = f'https://trove.nla.gov.au/{issue["id"]}/download?downloadOption=ocr&firstPage={page}&lastPage={page}'
# print(url)
# Get the file
r = s.get(url, timeout=120)
# Check there was no error
if r.status_code == requests.codes.ok:
# Check that the file's not empty
r.encoding = "utf-8"
if len(r.text) > 0 and not r.text.isspace():
# Check that the file isn't HTML (some not found pages don't return 404s)
if (
BeautifulSoup(r.text, "html.parser").find("html")
is None
):
# If everything's ok, save the file
with open(
file_path, "w", encoding="utf-8"
) as text_file:
text_file.write(r.text)
time.sleep(1)
Get a list of available issues for this title.
issues = harvest_metadata("nla.obj-522689844")
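Harvesting the issue list involves a lot of requests, so you might want to save the results to a file that can be reloaded later without re-harvesting. A minimal sketch – the issues.json filename is just an example:
# Optional: save the harvested issue metadata so it can be reloaded in a later session
with open("issues.json", "w", encoding="utf-8") as json_file:
    json.dump(issues, json_file, indent=2)

# To reload it later:
# with open("issues.json", "r", encoding="utf-8") as json_file:
#     issues = json.load(json_file)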
Save the OCRd text of each issue page by page. This means there'll be one text file for every page in every volume.
save_ocr(issues, "nla.obj-522689844")
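Downloading every page can take many hours for a large series. You can check progress (or confirm a completed harvest) by counting the text files saved so far – a quick sketch, assuming the default 'directories' output folder:
# Count the page text files harvested so far (one file per page, in one folder per volume)
harvested_pages = list(Path("directories").glob("*/*.txt"))
print(f"{len(harvested_pages)} pages saved across {len(set(p.parent for p in harvested_pages))} volumes")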
Assemble the database!¶
Now we've got all the text, we can process each line and add everything into an SQLite database.
def get_work_metadata(obj_id):
"""
Extracts metadata embedded as a JSON string in a work's HTML page.
See: https://glam-workbench.net/trove-books/metadata-for-digital-works/
"""
# Get the HTML page
    response = s.get(f"https://nla.gov.au/{obj_id}", timeout=60)
# Search for the JSON string using regex
try:
work_data = re.search(
r"var work = JSON\.parse\(JSON\.stringify\((\{.*\})", response.text
).group(1)
except AttributeError:
# Just in case it's not there...
work_data = "{}"
print("No data found!")
# Return the JSON data
return json.loads(work_data)
db_path = Path("datasette-test")
db_path.mkdir(exist_ok=True)
# This is the basic metadata file that will be used by Datasette
# The processing steps below will add details for each table/volume into the metadata file
# Obviously you'd modify this for a different publication!
metadata = {
"title": "New South Wales Post Office Directories",
"description_html": "<p>This is an experimental interface to the NSW Post Office Directories, which have been digitised from the collections of the State Library of NSW and are <a href='https://nla.gov.au/nla.obj-518308191'>now available on Trove</a>.</p><p>Searching for entries in the directories on Trove is difficult, because Trove's results are grouped by 'article', which is not very helpful for publications like these where information is organised by row. This interface searches for individual lines of text, rather than pages or articles. So it points you straight to entries of interest. Once you've found something, you can view the entry within the context of the complete page, or click back to Trove to explore further.</p><p>You can currently search across 15 volumes from 1886 to 1908.",
"databases": {
"post-office-directories": {
"title": "New South Wales Post Office Directories, 1886 to 1908",
"source": "Trove",
"source_url": "https://nla.gov.au/nla.obj-518308191",
"tables": {},
}
},
}
Now we'll process the data and add it to an SQLite database.
# Create the database
# Change the name as appropriate!
db = Database(Path(db_path, "post-office-directories.db"))
# Create database tables for pages and volumes
page_table = db["pages"]
vols_table = db["volumes"]
# Loop through the directories of each volume created by the harvesting process (above)
# Use natsorted so that they're processed in date order
vols = natsorted(
[
d
for d in Path("directories").glob(
"the-new-south-wales-post-office-directory-collection*"
)
if d.is_dir()
]
)
for vol in vols:
print(vol)
# In the case of the PO Directories each volume has a different year (or year range) in the title.
# Here we're extracting the year and id, but extracting the year in this way might not work for other titles.
# The year is used as the title of the table in Datasette
year, obj_id = re.search(r"-([0-9\-]+?)-(nla.obj-\d+)", vol.name).groups()
# Add a record for this volume to the database
vols_table.insert({"vol_id": obj_id, "year": year}, pk="vol_id")
# Get the embedded JSON data for this volume and extract a list of pages
work_data = get_work_metadata(obj_id)
work_pages = work_data["children"]["page"]
# Get the title from the work metadata
title = work_data["title"]
# Update the metadata file with details of this volume
metadata["databases"]["post-office-directories"]["tables"][year] = {
"title": f"{title} {year}",
"source": "Trove",
"source_url": f"https://nla.gov.au/{obj_id}",
"searchmode": "raw",
}
# Create a table for this volume. For the PO directories I'm using the year as the table name.
# If year isn't available, some other way of naming the table would be necessary, such as the full title.
vol_table = db[year]
# Loop through all the text page by page for this volume
pages = natsorted([p for p in vol.glob("*") if p.is_file()], alg=ns.PATH)
for page in pages:
lines = []
# Insert details about this page into the pages table
        page_num = int(re.search(r"-page(\d+)$", page.stem).group(1))
page_table.insert(
{
"vol_id": obj_id,
"page_id": work_pages[page_num]["pid"],
"page": page_num,
},
pk=("vol_id", "page"),
foreign_keys=[("vol_id", "volumes")],
)
# Open the text file and loop through the lines
with page.open("r") as txt:
for num, line in enumerate(txt):
# Get rid of blank lines
line = line.replace("\n", "")
# If line is not blank, add details to a list of lines from this page
if line:
lines.append(
{
"page": page_num,
"line": num + 1,
"text": line,
"page_id": work_pages[page_num]["pid"],
}
)
# Insert all the lines from this page into the db
vol_table.insert_all(
lines, pk=("page", "line"), foreign_keys=[("page_id", "pages", "page_id")]
)
# Add a full text index on the line text
vol_table.enable_fts(["text"])
# Save the updated metadata file
with open(Path(db_path, "metadata.json"), "w") as md_file:
json.dump(metadata, md_file, indent=4)
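Before opening the database in Datasette, you can check that the full-text indexes behave as expected by querying one of the volume tables with sqlite-utils' search() method. A quick sketch – the table name ('1886') and search term are just examples, so use a year and a term that exist in your own data:
# Quick sanity check of the full-text index on one of the volume tables
# (the table name and search term below are examples only)
for row in db["1886"].search("blacksmith"):
    print(row["page"], row["line"], row["text"])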
Opening the db in Datasette¶
You should now have an SQLite database containing the indexed text of all the volumes in the collection. You can explore the database using Datasette. In a Python environment you'll first need to install it:
pip install datasette
Then, from within the directory containing the database, run:
datasette post-office-directories.db -m metadata.json
To share your database publicly, look at Datasette's publishing options.
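For example, Datasette can package and deploy the database to a hosting service in a single step. A sketch using the Google Cloud Run option – the service name is a placeholder, and you'll need the Google Cloud SDK installed and authenticated first:
datasette publish cloudrun post-office-directories.db --metadata metadata.json --service=post-office-directories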
Created by Tim Sherratt for the GLAM Workbench as part of the Everyday Heritage project.