Explore harvested text files¶

In [1]:
import os
import zipfile
from pathlib import Path

import altair as alt
import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

nltk.download("stopwords")
nltk.download("punkt")

stopwords = nltk.corpus.stopwords.words("english")
# Add a couple of common OCR misreadings of "the" to the stopwords
stopwords += ["tho", "tbe"]

# Are you using Jupyter Lab?
# If so, either don't run this cell or comment out the line below

# alt.renderers.enable('notebook')

# If you forgot, ran this cell, and are now getting strange warnings when you make a chart,
# uncomment the following line and run this cell again to reset the chart renderer

# alt.renderers.enable('default')

# alt.data_transformers.enable('json')
[nltk_data] Downloading package stopwords to /home/tim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/tim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
In [2]:
%%capture
# Load variables from the .env file if it exists
# Use %%capture to suppress messages
%load_ext dotenv
%dotenv
In [ ]:
# Import a harvest zip file you've created previously
# First upload the zip file to the data directory, then run this cell

for zipped in sorted(Path("data").glob("*.zip")):
    print(f"Unzipping {zipped}...")
    with zipfile.ZipFile(zipped, "r") as zip_file:
        zip_file.extractall(Path(f"data/{zipped.stem}"))
In [3]:
def get_latest_harvest():
    """
    Get the timestamp of the most recent harvest.
    """
    harvests = sorted(
        [d for d in Path("data").iterdir() if d.is_dir() and not d.name.startswith(".")]
    )
    try:
        harvest = harvests[-1]
    except IndexError:
        print("No harvests!")
        harvest = None
    return harvest
In [4]:
def get_docs(harvest):
    """Yield the text of each harvested file in turn."""
    docs_path = get_docs_path(harvest)
    for p in docs_path:
        yield p.read_text(encoding="utf-8").strip()


def get_docs_path(harvest):
    """Get a sorted list of paths to the text files in a harvest."""
    path = Path(harvest, "text")
    docs_path = [p for p in sorted(path.glob("*.txt"))]
    return docs_path


def get_file_names(harvest):
    """Get the file names (minus extensions) of the harvested text files."""
    return [p.stem for p in get_docs_path(harvest)]
In [5]:
# In the testing environment, open a test harvest
if os.getenv("GW_STATUS") == "dev":
    harvest = Path("data", "1655952487")
# Otherwise open the most recent harvest
# (to open a specific harvest instead, set harvest to that directory's path)
else:
    harvest = get_latest_harvest()
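Because get_docs() is a generator, you can peek at a single document without loading the whole harvest into memory. A quick sketch (not one of the original cells):

docs = get_docs(harvest)
# Show the first 500 characters of the first document
print(next(docs)[:500])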
In [6]:
# Create a document-term matrix of word frequencies for the harvested documents
vectorizer = CountVectorizer(
    stop_words=stopwords, max_features=10000, ngram_range=(1, 1)
)
# preprocessor = lambda x: re.sub(r'(\d[\d\.])+', 'NUM', x.lower())
X_freq = np.asarray(vectorizer.fit_transform(get_docs(harvest)).todense())
df_freq = pd.DataFrame(
    X_freq, columns=vectorizer.get_feature_names_out(), index=get_file_names(harvest)
)
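The resulting dataframe, df_freq, has one row per document and one column per word in the 10,000-word vocabulary, so you can look up the frequencies of any individual word directly. For example, using a word that happens to be common in this sample harvest (substitute any word from your own results):

# The five documents in which 'brisbane' occurs most often
df_freq["brisbane"].nlargest(5)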
In [7]:
# The 20 most frequent words across the whole harvest
df_freq.sum().nlargest(20)
Out[7]:
st          68466
street      62029
good        41017
rooms       39883
new         32997
apply       30852
mr          30242
co          28497
wanted      25910
10          25748
room        25505
house       25309
sale        24637
office      22547
per         21527
two         19313
terms       18843
one         18480
land        18250
brisbane    18066
dtype: int64
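You'll notice that the most frequent 'words' include numbers like '10' as well as abbreviations like 'st' and 'co'. If you want to exclude purely numeric tokens, one option is to drop those columns. This is just a sketch (the df_freq_words name is arbitrary), not part of the original workflow:

# Keep only columns whose names are not purely numeric
df_freq_words = df_freq.loc[:, ~df_freq.columns.str.isnumeric()]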
In [8]:
# Reshape the word counts into long format: one row per word/document pair
df_freq.unstack().to_frame().reset_index().dropna(axis=0, subset=[0])
Out[8]:
          level_0  level_1                  0
0         00       18541005-13-4798540      2
1         00       18550403-13-4806194      0
2         00       18561031-13-7139235      0
3         00       18571126-13-7142543      0
4         00       18580710-13-7297359      3
...       ...      ...                    ...
30659995  zu       19541112-969-204759481   0
30659996  zu       19541116-12-50619201     0
30659997  zu       19541119-470-135256155   0
30659998  zu       19870909-11-122120946    0
30659999  zu       19880228-11-101979292    0

30660000 rows × 3 columns

In [9]:
%%time
# The number of words you want to show
num_words = 10
# Find the most frequent words in each document
top_words = pd.DataFrame(
    {
        n: df_freq.T[col].nlargest(num_words).index.tolist()
        for n, col in enumerate(df_freq.T)
    }
).T
top_words.index = get_file_names(harvest)
top_words.head()
CPU times: user 6.34 s, sys: 98 µs, total: 6.34 s
Wall time: 6.34 s
Out[9]:
0 1 2 3 4 5 6 7 8 9
18541005-13-4798540 mr street bo co melbourne per hotel day near mrs
18550403-13-4806194 john wm james mrs geo thos thomas henry miss jno
18561031-13-7139235 street nov mr sale apply land co near let east
18571126-13-7142543 machine made large messrs one year two iron prizes three
18580710-13-7297359 july 12 street sale clock sell co auction terms monday
In [10]:
# Transpose so that rows are words and columns are documents
df_freq.T
Out[10]:
18541005-13-4798540 18550403-13-4806194 18561031-13-7139235 18571126-13-7142543 18580710-13-7297359 18590407-13-5679082 18590520-13-5681431 18590524-809-154839403 18590812-67-60405583 18640227-13-5744865 ... 19530604-97-62492704 19530822-35-18381792 19531009-687-145667588 19531015-379-100665477 19540424-77-57316830 19541112-969-204759481 19541116-12-50619201 19541119-470-135256155 19870909-11-122120946 19880228-11-101979292
00 2 0 0 0 3 3 4 1 0 5 ... 0 13 0 0 0 0 0 0 0 0
000 3 11 5 3 25 12 35 8 0 18 ... 0 262 6 0 16 0 0 1 0 0
001 0 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
009 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
01 1 1 0 0 0 0 1 1 0 1 ... 0 6 0 1 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
yy 0 0 0 0 1 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
zealand 0 0 0 0 4 1 1 1 0 6 ... 0 1 0 0 0 0 0 0 0 0
zeehan 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
zinc 0 0 0 0 3 3 0 0 0 0 ... 0 4 0 0 0 0 0 0 0 0
zu 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

10000 rows × 3066 columns

Add a 'year' column to the dataframe¶

Each file name includes the date on which the article was published. For example, 18601224-13-5696044 was published on 24 December 1860. We can extract the year by taking the first four characters of the index.

In [11]:
df_freq["article_year"] = df_freq.index.str.slice(0, 4)

Most frequent words each year¶

In [12]:
# Group by year and sum the word counts
year_groups = df_freq.groupby(by="article_year")
year_group_totals = year_groups.sum()
In [13]:
# Reshape so that we have columns for year, word, and count
words_by_year = year_group_totals.unstack().to_frame().reset_index()
words_by_year.columns = ["word", "year", "count"]
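Keep in mind that these are raw counts, so years with more harvested text will naturally produce bigger numbers. If you want to compare years more fairly, one option is to convert counts to proportions of each year's total. This is only a sketch (the 'proportion' column name is arbitrary and isn't used in the cells below):

# Each word's share of the year's total word count
year_totals = words_by_year.groupby("year")["count"].transform("sum")
words_by_year["proportion"] = words_by_year["count"] / year_totals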
In [14]:
# Select the top ten words for each year
top_words_by_year = (
    words_by_year.sort_values("count", ascending=False)
    .groupby(by=["year"])
    .head(10)
    .reset_index(drop=True)
)
In [15]:
top_words_by_year["word"].value_counts()[:25]
Out[15]:
word
street       55
mr           41
st           38
good         35
new          31
co           28
rooms        24
10           22
mrs          21
one          17
apply        17
sale         14
years        13
office       13
wanted       12
per          10
room         10
11           10
house         9
loving        8
would         8
brisbane      8
may           8
day           8
melbourne     7
Name: count, dtype: int64

Visualise top ten words per year¶

In [16]:
alt.Chart(top_words_by_year).mark_bar().encode(
    y=alt.Y("word:N", sort="-x"), x="count:Q", facet=alt.Facet("year", columns=4)
).properties(width=120, height=120).resolve_scale(x="independent", y="independent")
Out[16]:
[Altair chart: faceted bar charts showing the top ten words for each year]
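If you want to keep a copy of a chart outside the notebook, you can assign it to a variable and use Altair's save() method. For example (the file name is arbitrary):

chart = alt.Chart(top_words_by_year).mark_bar().encode(
    y=alt.Y("word:N", sort="-x"), x="count:Q", facet=alt.Facet("year", columns=4)
).properties(width=120, height=120).resolve_scale(x="independent", y="independent")
chart.save("top_words_by_year.html")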

Visualise word frequencies over time¶

Create a faceted chart¶

In [17]:
alt.Chart(
    words_by_year.loc[words_by_year["word"].isin(["storm", "cyclone", "snow"])]
).mark_line().encode(
    x=alt.X("year:Q", axis=alt.Axis(format="c", title="Year")),
    y="count:Q",
    color="word:N",
    facet=alt.Facet("word:N", columns=1),
).properties(
    width=700, height=100
).resolve_scale(
    y="independent"
)
Out[17]:
[Altair chart: faceted line charts showing yearly counts of 'storm', 'cyclone', and 'snow']
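To chart different terms, just change the list of words passed to isin() in the cell above. Only words in the vectorizer's 10,000-word vocabulary will have counts, so it can be worth checking your candidates first, for example (the words here are arbitrary):

# Which of these candidate words actually appear in the vocabulary?
candidates = ["fire", "flood", "drought"]
[w for w in candidates if w in df_freq.columns]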

Created by Tim Sherratt (@wragge) for the GLAM Workbench.
Support this project by becoming a GitHub sponsor.