Explore harvested text files¶

In [1]:
import os
import zipfile
from pathlib import Path

import altair as alt
import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

nltk.download("stopwords")
nltk.download("punkt")

stopwords = nltk.corpus.stopwords.words("english")
# Add a couple of common OCR misreadings of "the" to the stopwords
stopwords += ["tho", "tbe"]

# Are you using Jupyter Lab?
# If so, either don't run this cell or comment out the line below

# alt.renderers.enable('notebook')

# If you forgot, ran this cell, and are now getting strange warnings when you make a chart,
# uncomment the following line and run this cell again to reset the chart renderer

# alt.renderers.enable('default')

# alt.data_transformers.enable('json')
[nltk_data] Downloading package stopwords to /home/tim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/tim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
In [2]:
%%capture
# Load variables from the .env file if it exists
# Use %%capture to suppress messages
%load_ext dotenv
%dotenv
In [ ]:
# Import a harvest zip file you've created previously
# First upload the zip file to the data directory, then run this cell

for zipped in sorted(Path("data").glob("*.zip")):
    print(f"Unzipping {zipped}...")
    with zipfile.ZipFile(zipped, "r") as zip_file:
        zip_file.extractall(Path(f"data/{zipped.stem}"))
In [3]:
def get_latest_harvest():
    """
    Get the timestamp of the most recent harvest.
    """
    harvests = sorted(
        [d for d in Path("data").iterdir() if d.is_dir() and not d.name.startswith(".")]
    )
    try:
        harvest = harvests[-1]
    except IndexError:
        print("No harvests!")
        harvest = None
    return harvest
In [4]:
def get_docs(harvest):
    """Yield the text of each harvested file in turn."""
    docs_path = get_docs_path(harvest)
    for p in docs_path:
        yield p.read_text(encoding="utf-8").strip()


def get_docs_path(harvest):
    """Get a sorted list of paths to the text files in a harvest."""
    path = Path(harvest, "text")
    docs_path = [p for p in sorted(path.glob("*.txt"))]
    return docs_path


def get_file_names(harvest):
    """Get the file names (minus extensions) of the harvested text files."""
    return [p.stem for p in get_docs_path(harvest)]
In [5]:
# In the testing environment, open a test harvest
if os.getenv("GW_STATUS") == "dev":
    harvest = Path("data", "1655952487")
# Otherwise open the most recent harvest
# (to open a specific harvest instead, set harvest to that directory's path)
else:
    harvest = get_latest_harvest()
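Because get_docs() is a generator, you can peek at a single document without loading the whole harvest into memory. A quick sketch (not one of the original cells):

docs = get_docs(harvest)
# Show the first 500 characters of the first document
print(next(docs)[:500])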
In [6]:
# Create a document-term matrix of word frequencies for the harvested documents
vectorizer = CountVectorizer(
    stop_words=stopwords, max_features=10000, ngram_range=(1, 1)
)
# preprocessor = lambda x: re.sub(r'(\d[\d\.])+', 'NUM', x.lower())
X_freq = np.asarray(vectorizer.fit_transform(get_docs(harvest)).todense())
df_freq = pd.DataFrame(
    X_freq, columns=vectorizer.get_feature_names_out(), index=get_file_names(harvest)
)
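The resulting dataframe, df_freq, has one row per document and one column per word in the 10,000-word vocabulary, so you can look up the frequencies of any individual word directly. For example, using a word that happens to be common in this sample harvest (substitute any word from your own results):

# The five documents in which 'brisbane' occurs most often
df_freq["brisbane"].nlargest(5)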
In [7]:
# The 20 most frequent words across the whole harvest
df_freq.sum().nlargest(20)
Out[7]:
st          68466
street      62029
good        41017
rooms       39883
new         32997
apply       30852
mr          30242
co          28497
wanted      25910
10          25748
room        25505
house       25309
sale        24637
office      22547
per         21527
two         19313
terms       18843
one         18480
land        18250
brisbane    18066
dtype: int64
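You'll notice that the most frequent 'words' include numbers like '10' as well as abbreviations like 'st' and 'co'. If you want to exclude purely numeric tokens, one option is to drop those columns. This is just a sketch (the df_freq_words name is arbitrary), not part of the original workflow:

# Keep only columns whose names are not purely numeric
df_freq_words = df_freq.loc[:, ~df_freq.columns.str.isnumeric()]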
In [8]:
# Reshape the word counts into long format: one row per word/document pair
df_freq.unstack().to_frame().reset_index().dropna(axis=0, subset=[0])
Out[8]:
          level_0  level_1                  0
0         00       18541005-13-4798540      2
1         00       18550403-13-4806194      0
2         00       18561031-13-7139235      0
3         00       18571126-13-7142543      0
4         00       18580710-13-7297359      3
...       ...      ...                    ...
30659995  zu       19541112-969-204759481   0
30659996  zu       19541116-12-50619201     0
30659997  zu       19541119-470-135256155   0
30659998  zu       19870909-11-122120946    0
30659999  zu       19880228-11-101979292    0

30660000 rows × 3 columns

In [9]:
%%time
# The number of words you want to show
num_words = 10
# Find the most frequent words in each document
top_words = pd.DataFrame(
    {
        n: df_freq.T[col].nlargest(num_words).index.tolist()
        for n, col in enumerate(df_freq.T)
    }
).T
top_words.index = get_file_names(harvest)
top_words.head()
CPU times: user 6.34 s, sys: 98 µs, total: 6.34 s
Wall time: 6.34 s
Out[9]:
0 1 2 3 4 5 6 7 8 9
18541005-13-4798540 mr street bo co melbourne per hotel day near mrs
18550403-13-4806194 john wm james mrs geo thos thomas henry miss jno
18561031-13-7139235 street nov mr sale apply land co near let east
18571126-13-7142543 machine made large messrs one year two iron prizes three
18580710-13-7297359 july 12 street sale clock sell co auction terms monday
In [10]:
# Transpose so that rows are words and columns are documents
df_freq.T
Out[10]:
18541005-13-4798540 18550403-13-4806194 18561031-13-7139235 18571126-13-7142543 18580710-13-7297359 18590407-13-5679082 18590520-13-5681431 18590524-809-154839403 18590812-67-60405583 18640227-13-5744865 ... 19530604-97-62492704 19530822-35-18381792 19531009-687-145667588 19531015-379-100665477 19540424-77-57316830 19541112-969-204759481 19541116-12-50619201 19541119-470-135256155 19870909-11-122120946 19880228-11-101979292
00 2 0 0 0 3 3 4 1 0 5 ... 0 13 0 0 0 0 0 0 0 0
000 3 11 5 3 25 12 35 8 0 18 ... 0 262 6 0 16 0 0 1 0 0
001 0 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
009 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
01 1 1 0 0 0 0 1 1 0 1 ... 0 6 0 1 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
yy 0 0 0 0 1 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
zealand 0 0 0 0 4 1 1 1 0 6 ... 0 1 0 0 0 0 0 0 0 0
zeehan 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
zinc 0 0 0 0 3 3 0 0 0 0 ... 0 4 0 0 0 0 0 0 0 0
zu 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

10000 rows × 3066 columns

Add a 'year' column to the dataframe¶

Each file name includes the date on which the article was published. For example, 18601224-13-5696044 was published on 24 December 1860. We can extract the year by taking the first four characters of the index.

In [11]:
df_freq["article_year"] = df_freq.index.str.slice(0, 4)

Most frequent words each year¶

In [12]:
# Group by year and sum the word counts
year_groups = df_freq.groupby(by="article_year")
year_group_totals = year_groups.sum()
In [13]:
# Reshape so that we have columns for year, word, and count
words_by_year = year_group_totals.unstack().to_frame().reset_index()
words_by_year.columns = ["word", "year", "count"]
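Keep in mind that these are raw counts, so years with more harvested text will naturally produce bigger numbers. If you want to compare years more fairly, one option is to convert counts to proportions of each year's total. This is only a sketch (the 'proportion' column name is arbitrary and isn't used in the cells below):

# Each word's share of the year's total word count
year_totals = words_by_year.groupby("year")["count"].transform("sum")
words_by_year["proportion"] = words_by_year["count"] / year_totals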
In [14]:
# Select the top ten words for each year
top_words_by_year = (
    words_by_year.sort_values("count", ascending=False)
    .groupby(by=["year"])
    .head(10)
    .reset_index(drop=True)
)
In [15]:
top_words_by_year["word"].value_counts()[:25]
Out[15]:
word
street       55
mr           41
st           38
good         35
new          31
co           28
rooms        24
10           22
mrs          21
one          17
apply        17
sale         14
years        13
office       13
wanted       12
per          10
room         10
11           10
house         9
loving        8
would         8
brisbane      8
may           8
day           8
melbourne     7
Name: count, dtype: int64

Visualise top ten words per year¶

In [16]:
alt.Chart(top_words_by_year).mark_bar().encode(
    y=alt.Y("word:N", sort="-x"), x="count:Q", facet=alt.Facet("year", columns=4)
).properties(width=120, height=120).resolve_scale(x="independent", y="independent")
Out[16]:
[Altair chart: faceted bar charts showing the top ten words for each year]
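If you want to keep a copy of a chart outside the notebook, you can assign it to a variable and use Altair's save() method. For example (the file name is arbitrary):

chart = alt.Chart(top_words_by_year).mark_bar().encode(
    y=alt.Y("word:N", sort="-x"), x="count:Q", facet=alt.Facet("year", columns=4)
).properties(width=120, height=120).resolve_scale(x="independent", y="independent")
chart.save("top_words_by_year.html")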

Visualise word frequencies over time¶

Create a faceted chart¶

In [17]:
alt.Chart(
    words_by_year.loc[words_by_year["word"].isin(["storm", "cyclone", "snow"])]
).mark_line().encode(
    x=alt.X("year:Q", axis=alt.Axis(format="c", title="Year")),
    y="count:Q",
    color="word:N",
    facet=alt.Facet("word:N", columns=1),
).properties(
    width=700, height=100
).resolve_scale(
    y="independent"
)
Out[17]:
[Altair chart: faceted line charts showing yearly counts of 'storm', 'cyclone', and 'snow']
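To chart different terms, just change the list of words passed to isin() in the cell above. Only words in the vectorizer's 10,000-word vocabulary will have counts, so it can be worth checking your candidates first, for example (the words here are arbitrary):

# Which of these candidate words actually appear in the vocabulary?
candidates = ["fire", "flood", "drought"]
[w for w in candidates if w in df_freq.columns]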

Created by Tim Sherratt (@wragge) for the GLAM Workbench.
Support this project by becoming a GitHub sponsor.