Reshaping your newspaper harvest¶
The Trove Newspaper Harvester downloads the OCRd text of newspaper articles as individual text files – one file for each article. That's great for exploring the content of individual articles in depth, but sometimes you might want to zoom out and aggregate the files into larger chunks. For example, if you're interested in how language changes over time, you might want to create a separate corpus for each year in the results set. Or perhaps you want to examine differences in the way particular newspapers talk about an event by grouping the articles by newspaper. This notebook provides a slice and dice wonder tool for Trove newspaper harvests, enabling you to repackage OCRd text by decade, year, and newspaper title. It saves the results as zip files, concatenated text files, or CSV files with embedded text. These repackaged slices should suit a variety of text analysis tools and questions.
import os
import shutil
from pathlib import Path
from zipfile import ZIP_DEFLATED
from zipfile import Path as ZipPath
from zipfile import ZipFile
import pandas as pd
from dotenv import load_dotenv
from natsort import natsorted
from trove_newspaper_harvester.core import Harvester, prepare_query
load_dotenv()
Upload an existing harvest¶
If you want to reshape a dataset downloaded from a previous run of the Trove Newspaper Harvester, upload the zipped dataset file to the `zips` directory:

- double click the `zips` folder to open it
- click on the upload icon to select your existing dataset

Once the file has been uploaded to the `zips` directory, run the cell below to unpack the contents into the `data` directory.
# Unzip the contents of the `zips` directory and save to the `data` directory
for zip_path in Path("zips").glob("*.zip"):
    zip_file = ZipFile(zip_path)
    # If the zip already contains a top-level directory named after the
    # harvest, extract straight into `data`
    if ZipPath(zip_file, at=f"{zip_path.stem}/").exists():
        zip_file.extractall("data")
    # Otherwise create a directory for the harvest and extract into it
    else:
        output_path = Path("data", zip_path.stem)
        output_path.mkdir(exist_ok=True)
        zip_file.extractall(output_path)
Code for the HarvestSlicer¶
class HarvestSlicer:
    def __init__(self, harvest_id, data_dir="data", relevance_percent=None):
        """
        Initialise the Slicer with details of the harvest.
        """
        self.harvest_id = harvest_id
        self.data_path = Path(data_dir, harvest_id)
        self.text_path = Path(self.data_path, "text")
        self.relevance_percent = relevance_percent
        df = pd.read_csv(Path(self.data_path, "results.csv"), low_memory=False)
        df["year"] = df["date"].str.slice(0, 4)
        # Optionally drop articles whose relevance score falls below the cut-off
        if relevance_percent:
            df = df.loc[
                df["relevance"] > df["relevance"].quantile(relevance_percent / 100)
            ]
        self.df = df

    def get_years(self):
        """
        Get a list of the years in which articles in the current harvest were published.
        """
        return sorted(self.df["year"].unique())

    def get_titles(self, year=""):
        """
        Get a list of the newspaper titles in which articles in the current harvest
        were published, optionally limited to a single year.
        """
        df = self.df.copy()
        # Limit the list to titles with articles published in the given year
        if year:
            df = df.loc[df["year"] == str(year)]
        return sorted(df["newspaper_id"].unique())

    def get_top_titles(self, sample_size=10, measure="articles", period=None):
        """
        Get a list of the 'top' titles in which articles in the current harvest were published.
        'Top' can be measured by either the number of articles, or the number of words.

        Parameters:
        - sample_size: number of titles to include
        - measure: how to determine the ranking, either 'articles' or 'words'
        - period: specify a year or decade
        """
        df = self.df.copy()
        if period:
            df = df.loc[df["date"].str.startswith(str(period))]
        if measure == "articles":
            sample = (
                df.groupby("newspaper_id")
                .size()
                .to_frame()
                .reset_index()
                .sort_values(0, ascending=False)[:sample_size]
            )
        elif measure == "words":
            sample = (
                df.groupby("newspaper_id")["words"]
                .sum()
                .to_frame()
                .reset_index()
                .sort_values("words", ascending=False)[:sample_size]
            )
        return sample["newspaper_id"].to_list()

    def slice_by_time_title(self, period=None, unit="year", title=None, save_as="zip"):
        """
        Slice the collection of harvested newspaper articles to create a subset using
        the supplied parameters.

        Parameters:
        - period: value of year or decade, eg: "1950"
        - unit: unit of time, either "year" or "decade"
        - title: newspaper title identifier
        - save_as: how to save the slice, either "zip" or "text"
        """
        relevance = ""
        if self.relevance_percent:
            relevance = f"-relevance-{self.relevance_percent}"
        # Work out the file name pattern and output location for this slice
        if period and title:
            glob_pattern = f"{period}*-{title}-*.txt"
            filters = [unit, str(period), "title", str(title)]
            output_path = Path(self.data_path, f"{unit}-title{relevance}")
        elif period:
            glob_pattern = f"{period}*.txt"
            filters = [unit, str(period)]
            output_path = Path(self.data_path, f"{unit}{relevance}")
        elif title:
            glob_pattern = f"*-{title}-*.txt"
            filters = ["title", str(title)]
            output_path = Path(self.data_path, f"title{relevance}")
        else:
            return
        output_path.mkdir(exist_ok=True)
        # Save into a new zip file
        if save_as == "zip":
            zip_path = Path(output_path, f"{self.harvest_id}-{'-'.join(filters)}.zip")
            with ZipFile(zip_path, "w", ZIP_DEFLATED) as zip_file:
                for text_file in self.text_path.glob(glob_pattern):
                    zip_file.write(text_file, text_file.name)
        # Save as one big concatenated text file
        elif save_as == "text":
            with Path(output_path, f"{self.harvest_id}-{'-'.join(filters)}.txt").open(
                "w"
            ) as combined_text:
                for text_file in natsorted(self.text_path.glob(glob_pattern)):
                    combined_text.write(text_file.read_text())
                    combined_text.write("\n\n")

    def slice_titles(self, save_as="zip"):
        """
        Create slices for each newspaper title.

        Parameters:
        - save_as: how to save the slice, either "zip" or "text"
        """
        titles = self.get_titles()
        for title in titles:
            self.slice_by_time_title(title=title, save_as=save_as)

    def slice_top_titles(self, measure="articles", sample_size=10, save_as="zip"):
        """
        Create slices for 'top' newspaper titles.
        'Top' can be measured by either the number of articles, or the number of words.

        Parameters:
        - sample_size: number of titles to include
        - measure: how to determine the ranking, either 'articles' or 'words'
        - save_as: how to save the slice, either "zip" or "text"
        """
        top_titles = self.get_top_titles(measure=measure, sample_size=sample_size)
        for title in top_titles:
            self.slice_by_time_title(title=title, save_as=save_as)

    def slice_years(self, save_as="zip"):
        """
        Create slices by year.

        Parameters:
        - save_as: how to save the slice, either "zip" or "text"
        """
        years = self.get_years()
        for year in years:
            self.slice_by_time_title(period=year, save_as=save_as)

    def slice_decades(self, save_as="zip"):
        """
        Create slices by decade.

        Parameters:
        - save_as: how to save the slice, either "zip" or "text"
        """
        years = self.get_years()
        # The first three digits of a year identify its decade
        decades = sorted(set([str(y)[:3] for y in years]))
        for decade in decades:
            self.slice_by_time_title(period=decade, unit="decade", save_as=save_as)

    def slice_years_titles(self, save_as="zip"):
        """
        Create slices for each combination of newspaper title and year.

        Parameters:
        - save_as: how to save the slice, either "zip" or "text"
        """
        years = self.get_years()
        for year in years:
            titles = self.get_titles(year=year)
            for title in titles:
                self.slice_by_time_title(period=year, title=title, save_as=save_as)

    def slice_years_top_titles(self, measure="articles", sample_size=10, save_as="zip"):
        """
        Create slices for each combination of year and 'top' newspaper titles.
        'Top' can be measured by either the number of articles, or the number of words.

        Parameters:
        - sample_size: number of titles to include
        - measure: how to determine the ranking, either 'articles' or 'words'
        - save_as: how to save the slice, either "zip" or "text"
        """
        years = self.get_years()
        for year in years:
            titles = self.get_top_titles(
                measure=measure, sample_size=sample_size, period=year
            )
            for title in titles:
                self.slice_by_time_title(period=year, title=title, save_as=save_as)
Using the Harvest Slicer¶
To create a new Harvest Slicer run:
slicer = HarvestSlicer("[Your Harvest ID]")
Substitute your harvest's identifier for `[Your Harvest ID]`. The harvest identifier is the name of the directory containing your harvest. This will usually be a string of numbers representing the date/time when the harvest was started. For example:
slicer = HarvestSlicer("20240522025457")
By default, harvests are saved in the `data` directory. If your harvest is in a different directory, you need to supply a `data_dir` parameter set to the directory name. For example:
slicer = HarvestSlicer("20240522025457", data_dir="myharvests")
By default, the HarvestSlicer will operate on all the results in the harvested dataset. However, you might want to do some initial filtering by making use of relevance scores. The relevance scores are calculated by Trove's search index and take into account things like where, and how often, your keywords appear in an article. Use the `relevance_percent` parameter to specify a cut-off value for inclusion. For example, if you set `relevance_percent` to `50`, only articles with relevance scores in the top 50% of scores will be included in your dataset:
slicer = HarvestSlicer("20240522025457", relevance_percent=50)
Enter your harvest id below and run the cell to create a Harvest Slicer.
# Substitute your harvest identifier below
slicer = HarvestSlicer("[Your Harvest ID]")
Optionally filter your dataset by relevance score.
# Substitute your harvest identifier below
# Change relevance_percent to your desired cutoff point
slicer = HarvestSlicer("[Your Harvest ID]", relevance_percent=50)
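If you're not sure what cut-off makes sense, it can help to look at the spread of relevance scores in your harvest before slicing. This is a minimal sketch (not part of the original notebook) that uses the `df` attribute created by `HarvestSlicer.__init__()`:

# Summarise the spread of relevance scores in the harvested results --
# `slicer.df` is the DataFrame loaded from results.csv above
slicer.df["relevance"].describe()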
Slicing by decade or year¶
You can create slices of harvested articles by year or decade. These slices can be saved as either a zip file containing the individual text files, or as one big text file concatenating the contents of all the articles, with a blank line between each article.
The slices will be saved in directories named `year` or `decade`. Each slice is named using the harvest identifier and the year or decade. For example, if you sliced the `20240522025457` harvest by year, you'd end up with a `year` directory containing files like `20240522025457-year-1950.txt`.
Slice by year and save the results as zip files (the default):
slicer.slice_years()
Slice by year and save the results as concatenated text files:
slicer.slice_years(save_as="text")
Slice by decade and save the results as zip files (the default):
slicer.slice_decades()
Slice by decade and save the results as concatenated text files:
slicer.slice_decades(save_as="text")
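Once you've saved yearly slices as concatenated text files, you can feed them straight into a text analysis tool, or run quick checks in Python. As a minimal sketch (not part of the original notebook), the cell below counts the words in each yearly slice; it assumes the default `data` directory, no relevance filtering, and that you ran `slice_years(save_as="text")` above:

# Rough word counts for each yearly slice created with save_as="text"
for text_file in sorted(Path("data", "[Your Harvest ID]", "year").glob("*.txt")):
    print(text_file.stem, len(text_file.read_text().split()))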
Slice by newspaper¶
You can create slices of harvested articles according to the newspaper in which they were published. These slices can be saved as either a zip file containing the individual text files, or as one big text file concatenating the contents of all the articles, with a blank line between each article.
There are close to 2,000 different newspapers in Trove. To limit the number of slices you can choose to only save articles from the 'top' newspapers in the dataset. Top is measured by looking at either the total number of articles, or the total number of words in articles. You can choose how many newspapers in the ranked 'top' list to include.
The slices will be saved in a directory named `title`. Each slice is named using the harvest identifier and the newspaper identifier. For example, if you sliced the `20240522025457` harvest by title, you'd end up with a `title` directory containing files like `20240522025457-title-11.txt` (`11` is Trove's identifier for the Canberra Times).
Slice by newspaper title and save the results as zip files (the default):
slicer.slice_titles()
Slice by title and save the results as concatenated text files:
slicer.slice_titles(save_as="text")
Slice by newspaper title and save results from the 10 newspapers with the most articles as zip files (the default settings):
slicer.slice_top_titles()
Slice by newspaper title and save results from the 20 newspapers with the most words as concatenated text files:
slicer.slice_top_titles(measure="words", sample_size=20, save_as="text")
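To check the contents of a zipped slice without unpacking it, you can list the files it contains. A minimal sketch, using the hypothetical `20240522025457` harvest and the Canberra Times (`11`) example from above:

# Count the articles inside a single title slice
with ZipFile("data/20240522025457/title/20240522025457-title-11.zip") as zipped_slice:
    print(len(zipped_slice.namelist()), "articles in this slice")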
Slice by both year and newspaper¶
You can create slices of harvested articles from each newspaper, published in each year. This means there'll be a slice for each combination of title and year. These slices can be saved as either a zip file containing the individual text files, or as one big text file concatenating the contents of all the articles, with a blank line between each article.
To limit the number of slices you can choose to only save articles from the 'top' newspapers in the dataset. Top is measured by looking at either the total number of articles, or the total number of words in articles. You can choose how many newspapers in the ranked 'top' list to include.
The slices will be saved in a directory named `year-title`. Each slice is named using the harvest identifier, the year, and the newspaper identifier. For example, if you sliced the `20240522025457` harvest by year and title, you'd end up with a `year-title` directory containing files like `20240522025457-year-1950-title-11.txt` (`11` is Trove's identifier for the Canberra Times).
Slice by year and newspaper title and save the results as zip files (the default):
slicer.slice_years_titles()
Slice by year and newspaper title and save the results as concatenated text files:
slicer.slice_years_titles(save_as="text")
Slice by year and newspaper title and save results from the 10 newspapers with the most articles as zip files (the default settings):
slicer.slice_years_top_titles()
Slice by year and newspaper title and save results from the 20 newspapers with the most words as concatenated text files:
slicer.slice_years_top_titles(measure="words", sample_size=20, save_as="text")
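Year-title slices make it easy to compare newspapers at a single point in time. As a rough sketch (assuming text slices were created above, and using 1950 as a hypothetical year):

# Compare how much text each newspaper contributed in 1950,
# using year-title slices saved with save_as="text"
for text_file in sorted(
    Path("data", "[Your Harvest ID]", "year-title").glob("*-year-1950-*.txt")
):
    print(text_file.stem, len(text_file.read_text().split()), "words")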
Create a CSV file with a subset of results¶
The `HarvestSlicer` creates new collections of OCRd text. For some purposes it might be more useful to create a subset of the harvested metadata in the `results.csv` file, adding the OCRd text to a new CSV file.
The `filter_results()` function creates a new CSV file with a subset of the original results, filtering by year and/or newspaper title. By default, it will also add the OCRd text from each article to a new `full_text` column, and filter the columns in the dataset to include only `title`, `date`, `page`, `newspaper_title`, `url`, and `full_text`. Both these defaults can be changed.
The resulting CSV files are saved in the harvest directory. For example, a dataset that was filtered to include results from 1950 published in the Canberra Times (id '11'), with the OCRd text added, would be saved as: `filtered-results-year-1950-title-11-text.csv`.
def filter_results(
    harvest_id,
    data_dir="data",
    relevance_percent=None,
    year=None,
    title=None,
    add_text=True,
    fields=None,
):
    """
    Filter an existing results set by year and/or title, adding the OCRd text of each
    individual article to a new `full_text` column.

    Parameters:
    - harvest_id: identifier of the harvest to filter
    - data_dir: location of the harvest (default is 'data')
    - relevance_percent: relevance score cut off
    - year: eg '1950'
    - title: Trove newspaper title identifier, eg '11'
    - add_text: add OCRd text to CSV (default is True)
    - fields: list of fields to include in the resulting CSV

    Result:
    - saves the results as a CSV file
    """
    # Copy the field list so a shared default isn't mutated between calls
    if fields is None:
        fields = ["title", "date", "page", "newspaper_title", "url"]
    else:
        fields = list(fields)
    data_path = Path(data_dir, harvest_id)
    df = pd.read_csv(Path(data_path, "results.csv"), low_memory=False)
    filters = []
    if relevance_percent:
        # Keep only articles whose relevance score is above the cut-off
        cutoff = df["relevance"].quantile(relevance_percent / 100)
        df = df.loc[df["relevance"] > cutoff]
        print(f"Relevance score cut-off: {cutoff}")
        filters += ["relevance", str(relevance_percent)]
    if year and title:
        df = df.loc[
            (df["date"].str.startswith(str(year))) & (df["newspaper_id"] == int(title))
        ]
        filters += ["year", str(year), "title", str(title)]
    elif year:
        df = df.loc[df["date"].str.startswith(str(year))]
        filters += ["year", str(year)]
    elif title:
        df = df.loc[df["newspaper_id"] == int(title)]
        filters += ["title", str(title)]
    if add_text:
        # Read the OCRd text of each article into a new `full_text` column
        df["full_text"] = df["text"].apply(lambda x: Path(data_path, x).read_text())
        fields.append("full_text")
        filters.append("text")
    df[fields].to_csv(
        Path(data_path, f"filtered-results-{'-'.join(filters)}.csv"), index=False
    )
Create a filtered results set containing articles published in 1950:
# Insert your harvest identifier between the quotes
filter_results("[Your harvest id]", year=1950)
Create a filtered results set containing articles published in the Canberra Times:
filter_results("[Your harvest id]", title=11)
Create a filtered results set containing articles published in 1950 in the Canberra Times:
filter_results("[Your harvest id]", year=1950, title=11)
Filter the results set using the relevance scores of articles, saving results with scores in the top 50%, without adding the OCRd text to the CSV file.
# Insert your harvest identifier between the quotes
filter_results("[Your harvest id]", relevance_percent=50, add_text=False)
Filter the results set using the relevance scores of articles, saving results with scores in the top 50%, and adding the OCRd text to the CSV file.
# Insert your harvest identifier between the quotes
filter_results("[Your harvest id]", relevance_percent=50)
# IGNORE CELL -- TESTING ONLY
if os.getenv("GW_STATUS") == "dev":
    API_KEY = os.getenv("TROVE_API_KEY")
    query = "https://trove.nla.gov.au/search/category/newspapers?keyword=%22octopus%20intelligence%22"
    params = prepare_query(query=query)
    harvester = Harvester(query_params=params, key=API_KEY, text=True)
    harvester.harvest()
    harvester.save_csv()
    harvest_id = harvester.harvest_dir.name
    slicer = HarvestSlicer(harvest_id)
    slicer.slice_titles()
    slicer.slice_years_top_titles(measure="words", sample_size=20, save_as="text")
    slicer.slice_years()
    filter_results(harvest_id, year=1946)
    slicer = HarvestSlicer(harvest_id, relevance_percent=50)
    slicer.slice_titles()
    slicer.slice_years_top_titles(measure="words", sample_size=20, save_as="text")
    slicer.slice_years()
    filter_results(harvest_id, year=1946, relevance_percent=50)
    shutil.rmtree(harvester.harvest_dir)
Created by Tim Sherratt for the GLAM Workbench.