Explore harvested text files¶
In [1]:
import os
import zipfile
from pathlib import Path
import altair as alt
import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
nltk.download("stopwords")
nltk.download("punkt")
stopwords = nltk.corpus.stopwords.words("english")
# Add likely OCR misreadings of "the" (common in digitised newspapers) to the stopword list
stopwords += ["tho", "tbe"]
# Are you using Jupyter Lab?
# If so, either don't run this cell or comment out the line below
# alt.renderers.enable('notebook')
# If you forget, run this cell, and then get strange warnings when you make a chart,
# uncomment the following line and run this cell to reset the chart renderer
# alt.renderers.enable('default')
# alt.data_transformers.enable('json')
[nltk_data] Downloading package stopwords to /home/tim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/tim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
In [2]:
%%capture
# Load variables from the .env file if it exists
# Use %%capture to suppress messages
%load_ext dotenv
%dotenv
In [ ]:
# Import a harvest zip file you've created previously
# First upload the zip file to the data directory, then run this cell
for zipped in sorted(Path("data").glob("*.zip")):
    print(f"Unzipping {zipped}...")
    with zipfile.ZipFile(zipped, "r") as zip_file:
        zip_file.extractall(Path(f"data/{zipped.stem}"))
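If you only want to expand a single harvest, you could name the zip file directly rather than looping over everything in data. A minimal sketch with a hypothetical file name:

# Hypothetical: unzip one named harvest file only
with zipfile.ZipFile(Path("data", "my-harvest.zip"), "r") as zip_file:
    zip_file.extractall(Path("data", "my-harvest"))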
In [3]:
def get_latest_harvest():
    """
    Get the directory of the most recent harvest (harvest directories are named by timestamp).
    """
    harvests = sorted(
        [d for d in Path("data").iterdir() if d.is_dir() and not d.name.startswith(".")]
    )
    try:
        harvest = harvests[-1]
    except IndexError:
        print("No harvests!")
        harvest = None
    return harvest
In [4]:
def get_docs(harvest):
    # Yield the text of each harvested file, one document at a time
    docs_path = get_docs_path(harvest)
    for p in docs_path:
        yield p.read_text(encoding="utf-8").strip()


def get_docs_path(harvest):
    # Return a sorted list of paths to the harvested text files
    path = Path(harvest, "text")
    docs_path = sorted(path.glob("*.txt"))
    return docs_path


def get_file_names(harvest):
    # Return the file names (without extensions) of the harvested files
    return [p.stem for p in get_docs_path(harvest)]
In [5]:
# In the testing environment, open a test harvest
if os.getenv("GW_STATUS") == "dev":
    harvest = Path("data", "1655952487")
# Otherwise open the most recent harvest
# (supply a harvest directory name here to open a specific harvest)
else:
    harvest = get_latest_harvest()
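As a quick sanity check, you could preview a few file names and the opening of the first document. A minimal sketch, assuming the helpers above and the selected harvest:

# Preview the first few file names and the start of the first document
print(get_file_names(harvest)[:5])
print(next(get_docs(harvest))[:200])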
In [6]:
vectorizer = CountVectorizer(
    stop_words=stopwords, max_features=10000, ngram_range=(1, 1)
)
# preprocessor = lambda x: re.sub(r'(\d[\d\.])+', 'NUM', x.lower())
X_freq = np.asarray(vectorizer.fit_transform(get_docs(harvest)).todense())
df_freq = pd.DataFrame(
    X_freq, columns=vectorizer.get_feature_names_out(), index=get_file_names(harvest)
)
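Note that .todense() materialises the full document-term matrix in memory. For much larger harvests, one alternative (a sketch only, using pandas' sparse accessor with the same vectorizer) is to keep the matrix sparse:

# Sketch: build a sparse dataframe instead of densifying the matrix
X_sparse = vectorizer.fit_transform(get_docs(harvest))
df_sparse = pd.DataFrame.sparse.from_spmatrix(
    X_sparse,
    columns=vectorizer.get_feature_names_out(),
    index=get_file_names(harvest),
)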
In [7]:
df_freq.sum().nlargest(20)
Out[7]:
st          68466
street      62029
good        41017
rooms       39883
new         32997
apply       30852
mr          30242
co          28497
wanted      25910
10          25748
room        25505
house       25309
sale        24637
office      22547
per         21527
two         19313
terms       18843
one         18480
land        18250
brisbane    18066
dtype: int64
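To check the total for any single word across the whole harvest, you can sum its column directly, for example:

# Total occurrences of one word across all documents
df_freq["street"].sum()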
In [8]:
df_freq.unstack().to_frame().reset_index().dropna(axis=0, subset=[0])
Out[8]:
| level_0 | level_1 | 0
---|---|---|---|
0 | 00 | 18541005-13-4798540 | 2 |
1 | 00 | 18550403-13-4806194 | 0 |
2 | 00 | 18561031-13-7139235 | 0 |
3 | 00 | 18571126-13-7142543 | 0 |
4 | 00 | 18580710-13-7297359 | 3 |
... | ... | ... | ... |
30659995 | zu | 19541112-969-204759481 | 0 |
30659996 | zu | 19541116-12-50619201 | 0 |
30659997 | zu | 19541119-470-135256155 | 0 |
30659998 | zu | 19870909-11-122120946 | 0 |
30659999 | zu | 19880228-11-101979292 | 0 |
30660000 rows × 3 columns
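If the unstack step is unfamiliar: it pivots the wide document-term matrix into one row per (word, document) pair. A toy example of the same wide-to-long reshape:

# Toy example of the wide-to-long reshape used above
toy = pd.DataFrame({"cat": [1, 0], "dog": [2, 3]}, index=["doc1", "doc2"])
long_form = toy.unstack().to_frame().reset_index()
long_form.columns = ["word", "doc", "count"]
print(long_form)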
In [9]:
%%time
# The number of words you want to show
num_words = 10
top_words = pd.DataFrame(
    {
        n: df_freq.T[col].nlargest(num_words).index.tolist()
        for n, col in enumerate(df_freq.T)
    }
).T
top_words.index = get_file_names(harvest)
top_words.head()
CPU times: user 6.34 s, sys: 98 µs, total: 6.34 s
Wall time: 6.34 s
Out[9]:
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9
---|---|---|---|---|---|---|---|---|---|---|
18541005-13-4798540 | mr | street | bo | co | melbourne | per | hotel | day | near | mrs |
18550403-13-4806194 | john | wm | james | mrs | geo | thos | thomas | henry | miss | jno |
18561031-13-7139235 | street | nov | mr | sale | apply | land | co | near | let | east |
18571126-13-7142543 | machine | made | large | messrs | one | year | two | iron | prizes | three |
18580710-13-7297359 | july | 12 | street | sale | clock | sell | co | auction | terms | monday |
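The dictionary comprehension above loops over every column of the transposed matrix. An equivalent (illustrative) formulation applies nlargest across the rows of df_freq instead:

# Alternative sketch: top words per document via apply
top_words_alt = df_freq.apply(
    lambda row: row.nlargest(num_words).index.tolist(), axis=1
)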
In [10]:
df_freq.T
Out[10]:
| 18541005-13-4798540 | 18550403-13-4806194 | 18561031-13-7139235 | 18571126-13-7142543 | 18580710-13-7297359 | 18590407-13-5679082 | 18590520-13-5681431 | 18590524-809-154839403 | 18590812-67-60405583 | 18640227-13-5744865 | ... | 19530604-97-62492704 | 19530822-35-18381792 | 19531009-687-145667588 | 19531015-379-100665477 | 19540424-77-57316830 | 19541112-969-204759481 | 19541116-12-50619201 | 19541119-470-135256155 | 19870909-11-122120946 | 19880228-11-101979292
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
00 | 2 | 0 | 0 | 0 | 3 | 3 | 4 | 1 | 0 | 5 | ... | 0 | 13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
000 | 3 | 11 | 5 | 3 | 25 | 12 | 35 | 8 | 0 | 18 | ... | 0 | 262 | 6 | 0 | 16 | 0 | 0 | 1 | 0 | 0 |
001 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
009 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
01 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | ... | 0 | 6 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
yy | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
zealand | 0 | 0 | 0 | 0 | 4 | 1 | 1 | 1 | 0 | 6 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
zeehan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
zinc | 0 | 0 | 0 | 0 | 3 | 3 | 0 | 0 | 0 | 0 | ... | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
zu | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
10000 rows × 3066 columns
Add a 'year' column to the dataframe¶
Each file name includes the date on which the article was published. For example, 18601224-13-5696044 was published on 24 December 1860. We can easily extract the year by just slicing the first four characters off the index.
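In plain Python the same slice looks like this:

# The first four characters of a file name are the publication year
"18601224-13-5696044"[:4]  # returns '1860'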
In [11]:
df_freq["article_year"] = df_freq.index.str.slice(0, 4)
Most frequent words each year¶
In [12]:
# Group by year and sum the word counts
year_groups = df_freq.groupby(by="article_year")
year_group_totals = year_groups.sum()
In [13]:
# Reshape so that we have columns for year, word, and count
words_by_year = year_group_totals.unstack().to_frame().reset_index()
words_by_year.columns = ["word", "year", "count"]
In [14]:
top_words_by_year = (
    words_by_year.sort_values("count", ascending=False)
    .groupby(by=["year"])
    .head(10)
    .reset_index(drop=True)
)
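The sort-then-head pattern works because groupby preserves the existing row order within each group, so taking head(10) keeps the ten largest counts per year. A quick (illustrative) check:

# Check: every year contributes at most ten rows
assert top_words_by_year.groupby("year").size().max() <= 10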
In [15]:
top_words_by_year["word"].value_counts()[:25]
Out[15]:
word
street       55
mr           41
st           38
good         35
new          31
co           28
rooms        24
10           22
mrs          21
one          17
apply        17
sale         14
years        13
office       13
wanted       12
per          10
room         10
11           10
house         9
loving        8
would         8
brisbane      8
may           8
day           8
melbourne     7
Name: count, dtype: int64
Visualise top ten words per year¶
In [16]:
alt.Chart(top_words_by_year).mark_bar().encode(
    y=alt.Y("word:N", sort="-x"), x="count:Q", facet=alt.Facet("year", columns=4)
).properties(width=120, height=120).resolve_scale(x="independent", y="independent")
Out[16]:
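To keep a copy of a chart outside the notebook, Altair charts can be saved as standalone HTML. A minimal sketch, assigning the chart above to a variable first (the variable and file names are illustrative):

# Sketch: save an Altair chart to a standalone HTML file
chart = alt.Chart(top_words_by_year).mark_bar().encode(
    y=alt.Y("word:N", sort="-x"), x="count:Q", facet=alt.Facet("year", columns=4)
).properties(width=120, height=120).resolve_scale(x="independent", y="independent")
chart.save("top_words_by_year.html")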
Visualise word frequencies over time¶
Create a faceted chart¶
In [17]:
alt.Chart(
    words_by_year.loc[words_by_year["word"].isin(["storm", "cyclone", "snow"])]
).mark_line().encode(
    x=alt.X("year:Q", axis=alt.Axis(format="c", title="Year")),
    y="count:Q",
    color="word:N",
    facet=alt.Facet("word:N", columns=1),
).properties(
    width=700, height=100
).resolve_scale(
    y="independent"
)
Out[17]:
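To chart different terms over time, change the list passed to isin(), e.g. a hypothetical set of words:

# Hypothetical: select a different set of words to compare
words_to_plot = ["drought", "flood", "fire"]
words_by_year.loc[words_by_year["word"].isin(words_to_plot)]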
Created by Tim Sherratt (@wragge) for the GLAM Workbench.
Support this project by becoming a GitHub sponsor.