Reshaping your newspaper harvest¶
The Trove Newspaper Harvester downloads the OCRd text of newspaper articles as individual text files – one file for each article. That's great for exploring the content of individual articles in depth, but sometimes you might want to zoom out and aggregate the files into larger chunks. For example, if you're interested in how language changes over time, you might want to create a separate corpus for each year in the results set. Or perhaps you want to examine differences in the way particular newspapers talk about an event by grouping the articles by newspaper. This notebook provides a slice and dice wonder tool for Trove newspaper harvests, enabling you to repackage OCRd text by decade, year, and newspaper title. It saves the results as zip files, concatenated text files, or CSV files with embedded text. These repackaged slices should suit a variety of text analysis tools and questions.
import os
import shutil
from pathlib import Path
from zipfile import ZIP_DEFLATED
from zipfile import Path as ZipPath
from zipfile import ZipFile
import pandas as pd
from dotenv import load_dotenv
from natsort import natsorted
from trove_newspaper_harvester.core import Harvester, prepare_query
load_dotenv()
Upload an existing harvest¶
If you want to reshape a dataset downloaded from a previous run of the Trove Newspaper Harvester, upload the zipped dataset file to the `zips` directory:

- double click the `zips` folder to open it
- click on the upload icon to select your existing dataset

Once the file has been uploaded to the `zips` directory, run the cell below to unpack the contents into the `data` directory.
# Unzip the contents of the `zips` directory and save to the `data` directory
for zip_path in Path("zips").glob("*.zip"):
    zip_file = ZipFile(zip_path)
    # If the zip already contains a top-level directory named after the
    # harvest, extract straight into `data`
    if ZipPath(zip_file, at=f"{zip_path.stem}/").exists():
        zip_file.extractall("data")
    # Otherwise create a directory for the harvest and extract into it
    else:
        output_path = Path("data", zip_path.stem)
        output_path.mkdir(exist_ok=True)
        zip_file.extractall(output_path)
Code for the HarvestSlicer¶
class HarvestSlicer:
    def __init__(self, harvest_id, data_dir="data", relevance_percent=None):
        """
        Initialise the Slicer with details of the harvest.
        """
        self.harvest_id = harvest_id
        self.data_path = Path(data_dir, harvest_id)
        self.text_path = Path(self.data_path, "text")
        self.relevance_percent = relevance_percent
        df = pd.read_csv(Path(self.data_path, "results.csv"), low_memory=False)
        df["year"] = df["date"].str.slice(0, 4)
        # Optionally drop articles whose relevance score falls below the cut-off
        if relevance_percent:
            df = df.loc[
                df["relevance"] > df["relevance"].quantile(relevance_percent / 100)
            ]
        self.df = df

    def get_years(self):
        """
        Get a list of the years in which articles in the current harvest were published.
        """
        return sorted(self.df["year"].unique())

    def get_titles(self, year=""):
        """
        Get a list of the newspaper titles in which articles in the current harvest
        were published, optionally limited to a single year.
        """
        df = self.df.copy()
        # Limit the list to titles with articles published in the given year
        if year:
            df = df.loc[df["year"] == str(year)]
        return sorted(df["newspaper_id"].unique())

    def get_top_titles(self, sample_size=10, measure="articles", period=None):
        """
        Get a list of the 'top' titles in which articles in the current harvest were published.
        'Top' can be measured by either the number of articles, or the number of words.

        Parameters:
        - sample_size: number of titles to include
        - measure: how to determine the ranking, either 'articles' or 'words'
        - period: specify a year or decade
        """
        df = self.df.copy()
        if period:
            df = df.loc[df["date"].str.startswith(str(period))]
        if measure == "articles":
            sample = (
                df.groupby("newspaper_id")
                .size()
                .to_frame()
                .reset_index()
                .sort_values(0, ascending=False)[:sample_size]
            )
        elif measure == "words":
            sample = (
                df.groupby("newspaper_id")["words"]
                .sum()
                .to_frame()
                .reset_index()
                .sort_values("words", ascending=False)[:sample_size]
            )
        return sample["newspaper_id"].to_list()

    def slice_by_time_title(self, period=None, unit="year", title=None, save_as="zip"):
        """
        Slice the collection of harvested newspaper articles to create a subset using
        the supplied parameters.

        Parameters:
        - period: value of year or decade, eg: "1950"
        - unit: unit of time, either "year" or "decade"
        - title: newspaper title identifier
        - save_as: how to save the slice, either "zip" or "text"
        """
        relevance = ""
        if self.relevance_percent:
            relevance = f"-relevance-{self.relevance_percent}"
        # Work out the file name pattern and output location for this slice
        if period and title:
            glob_pattern = f"{period}*-{title}-*.txt"
            filters = [unit, str(period), "title", str(title)]
            output_path = Path(self.data_path, f"{unit}-title{relevance}")
        elif period:
            glob_pattern = f"{period}*.txt"
            filters = [unit, str(period)]
            output_path = Path(self.data_path, f"{unit}{relevance}")
        elif title:
            glob_pattern = f"*-{title}-*.txt"
            filters = ["title", str(title)]
            output_path = Path(self.data_path, f"title{relevance}")
        else:
            return
        output_path.mkdir(exist_ok=True)
        # Save into a new zip file
        if save_as == "zip":
            zip_path = Path(output_path, f"{self.harvest_id}-{'-'.join(filters)}.zip")
            with ZipFile(zip_path, "w", ZIP_DEFLATED) as zip_file:
                for text_file in self.text_path.glob(glob_pattern):
                    zip_file.write(text_file, text_file.name)
        # Save as one big concatenated text file
        elif save_as == "text":
            with Path(output_path, f"{self.harvest_id}-{'-'.join(filters)}.txt").open(
                "w"
            ) as combined_text:
                for text_file in natsorted(self.text_path.glob(glob_pattern)):
                    combined_text.write(text_file.read_text())
                    combined_text.write("\n\n")

    def slice_titles(self, save_as="zip"):
        """
        Create slices for each newspaper title.

        Parameters:
        - save_as: how to save the slice, either "zip" or "text"
        """
        titles = self.get_titles()
        for title in titles:
            self.slice_by_time_title(title=title, save_as=save_as)

    def slice_top_titles(self, measure="articles", sample_size=10, save_as="zip"):
        """
        Create slices for 'top' newspaper titles.
        'Top' can be measured by either the number of articles, or the number of words.

        Parameters:
        - sample_size: number of titles to include
        - measure: how to determine the ranking, either 'articles' or 'words'
        - save_as: how to save the slice, either "zip" or "text"
        """
        top_titles = self.get_top_titles(measure=measure, sample_size=sample_size)
        for title in top_titles:
            self.slice_by_time_title(title=title, save_as=save_as)

    def slice_years(self, save_as="zip"):
        """
        Create slices by year.

        Parameters:
        - save_as: how to save the slice, either "zip" or "text"
        """
        years = self.get_years()
        for year in years:
            self.slice_by_time_title(period=year, save_as=save_as)

    def slice_decades(self, save_as="zip"):
        """
        Create slices by decade.

        Parameters:
        - save_as: how to save the slice, either "zip" or "text"
        """
        years = self.get_years()
        # The first three digits of a year identify its decade
        decades = sorted(set([str(y)[:3] for y in years]))
        for decade in decades:
            self.slice_by_time_title(period=decade, unit="decade", save_as=save_as)

    def slice_years_titles(self, save_as="zip"):
        """
        Create slices for each combination of newspaper title and year.

        Parameters:
        - save_as: how to save the slice, either "zip" or "text"
        """
        years = self.get_years()
        for year in years:
            titles = self.get_titles(year=year)
            for title in titles:
                self.slice_by_time_title(period=year, title=title, save_as=save_as)

    def slice_years_top_titles(self, measure="articles", sample_size=10, save_as="zip"):
        """
        Create slices for each combination of year and 'top' newspaper titles.
        'Top' can be measured by either the number of articles, or the number of words.

        Parameters:
        - sample_size: number of titles to include
        - measure: how to determine the ranking, either 'articles' or 'words'
        - save_as: how to save the slice, either "zip" or "text"
        """
        years = self.get_years()
        for year in years:
            titles = self.get_top_titles(
                measure=measure, sample_size=sample_size, period=year
            )
            for title in titles:
                self.slice_by_time_title(period=year, title=title, save_as=save_as)
Using the Harvest Slicer¶
To create a new Harvest Slicer run:
slicer = HarvestSlicer("[Your Harvest ID]")
Substitute your harvest's identifier for `[Your Harvest ID]`. The harvest identifier is the name of the directory containing your harvest. This will usually be a string of numbers representing the date/time when the harvest was started. For example:
slicer = HarvestSlicer("20240522025457")
By default, harvests are saved in the `data` directory. If your harvest is in a different directory, you need to supply a `data_dir` parameter set to the directory name. For example:
slicer = HarvestSlicer("20240522025457", data_dir="myharvests")
By default, the HarvestSlicer will operate on all the results in the harvested dataset. However, you might want to do some initial filtering by making use of relevance scores. The relevance scores are calculated by Trove's search index and take into account things like where, and how often, your keywords appear in an article. Use the `relevance_percent` parameter to specify a cut-off value for inclusion. For example, if you set `relevance_percent` to `50`, only articles with relevance scores in the top 50% of scores will be included in your dataset:
slicer = HarvestSlicer("20240522025457", relevance_percent=50)
Enter your harvest id below and run the cell to create a Harvest Slicer.
# Substitute your harvest identifier below
slicer = HarvestSlicer("[Your Harvest ID]")
Optionally filter your dataset by relevance score.
# Substitute your harvest identifier below
# Change relevance_percent to your desired cutoff point
slicer = HarvestSlicer("[Your Harvest ID]", relevance_percent=50)
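If you're not sure what cut-off makes sense, it can help to look at the spread of relevance scores in your harvest before slicing. This is a minimal sketch (not part of the original notebook) that uses the `df` attribute created by `HarvestSlicer.__init__()`:

# Summarise the spread of relevance scores in the harvested results --
# `slicer.df` is the DataFrame loaded from results.csv above
slicer.df["relevance"].describe()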
Slicing by decade or year¶
You can create slices of harvested articles by year or decade. These slices can be saved as either a zip file containing the individual text files, or as one big text file concatenating the contents of all the articles, with a blank line between each article.
The slices will be saved in directories named `year` or `decade`. Each slice is named using the harvest identifier and the year or decade. For example, if you sliced the `20240522025457` harvest by year, you'd end up with a `year` directory containing files like `20240522025457-year-1950.txt`.
Slice by year and save the results as zip files (the default):
slicer.slice_years()
Slice by year and save the results as concatenated text files:
slicer.slice_years(save_as="text")
Slice by decade and save the results as zip files (the default):
slicer.slice_decades()
Slice by decade and save the results as concatenated text files:
slicer.slice_decades(save_as="text")
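Once you've saved yearly slices as concatenated text files, you can feed them straight into a text analysis tool, or run quick checks in Python. As a minimal sketch (not part of the original notebook), the cell below counts the words in each yearly slice; it assumes the default `data` directory, no relevance filtering, and that you ran `slice_years(save_as="text")` above:

# Rough word counts for each yearly slice created with save_as="text"
for text_file in sorted(Path("data", "[Your Harvest ID]", "year").glob("*.txt")):
    print(text_file.stem, len(text_file.read_text().split()))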
Slice by newspaper¶
You can create slices of harvested articles according to the newspaper in which they were published. These slices can be saved as either a zip file containing the individual text files, or as one big text file concatenating the contents of all the articles, with a blank line between each article.
There are close to 2,000 different newspapers in Trove. To limit the number of slices you can choose to only save articles from the 'top' newspapers in the dataset. Top is measured by looking at either the total number of articles, or the total number of words in articles. You can choose how many newspapers in the ranked 'top' list to include.
The slices will be saved in a directory named `title`. Each slice is named using the harvest identifier and the newspaper identifier. For example, if you sliced the `20240522025457` harvest by title, you'd end up with a `title` directory containing files like `20240522025457-title-11.txt` (`11` is Trove's identifier for the Canberra Times).
Slice by newspaper title and save the results as zip files (the default):
slicer.slice_titles()
Slice by title and save the results as concatenated text files:
slicer.slice_titles(save_as="text")
Slice by newspaper title and save results from the 10 newspapers with the most articles as zip files (the default settings):
slicer.slice_top_titles()
Slice by newspaper title and save results from the 20 newspapers with the most words as concatenated text files:
slicer.slice_top_titles(measure="words", sample_size=20, save_as="text")
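To check the contents of a zipped slice without unpacking it, you can list the files it contains. A minimal sketch, using the hypothetical `20240522025457` harvest and the Canberra Times (`11`) example from above:

# Count the articles inside a single title slice
with ZipFile("data/20240522025457/title/20240522025457-title-11.zip") as zipped_slice:
    print(len(zipped_slice.namelist()), "articles in this slice")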
Slice by both year and newspaper¶
You can create slices of harvested articles from each newspaper, published in each year. This means there'll be a slice for each combination of title and year. These slices can be saved as either a zip file containing the individual text files, or as one big text file concatenating the contents of all the articles, with a blank line between each article.
To limit the number of slices you can choose to only save articles from the 'top' newspapers in the dataset. Top is measured by looking at either the total number of articles, or the total number of words in articles. You can choose how many newspapers in the ranked 'top' list to include.
The slices will be saved in a directory named `year-title`. Each slice is named using the harvest identifier, the year, and the newspaper identifier. For example, if you sliced the `20240522025457` harvest by year and title, you'd end up with a `year-title` directory containing files like `20240522025457-year-1950-title-11.txt` (`11` is Trove's identifier for the Canberra Times).
Slice by year and newspaper title and save the results as zip files (the default):
slicer.slice_years_titles()
Slice by year and newspaper title and save the results as concatenated text files:
slicer.slice_years_titles(save_as="text")
Slice by year and newspaper title and save results from the 10 newspapers with the most articles as zip files (the default settings):
slicer.slice_years_top_titles()
Slice by year and newspaper title and save results from the 20 newspapers with the most words as concatenated text files:
slicer.slice_years_top_titles(measure="words", sample_size=20, save_as="text")
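Year-title slices make it easy to compare newspapers at a single point in time. As a rough sketch (assuming text slices were created above, and using 1950 as a hypothetical year):

# Compare how much text each newspaper contributed in 1950,
# using year-title slices saved with save_as="text"
for text_file in sorted(
    Path("data", "[Your Harvest ID]", "year-title").glob("*-year-1950-*.txt")
):
    print(text_file.stem, len(text_file.read_text().split()), "words")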
Create a CSV file with a subset of results¶
The `HarvestSlicer` creates new collections of OCRd text. For some purposes it might be more useful to create a subset of the harvested metadata in the `results.csv` file, adding the OCRd text to a new CSV file.
The `filter_results()` function creates a new CSV file with a subset of the original results, filtering by year and/or newspaper title. By default, it will also add the OCRd text from each article to a new `full_text` column, and filter the columns in the dataset to include only `title`, `date`, `page`, `newspaper_title`, `url`, and `full_text`. Both these defaults can be changed.
The resulting CSV files are saved in the harvest directory. For example, a dataset that was filtered to include results from 1950 published in the Canberra Times (id '11'), with the OCRd text added, would be saved as: `filtered-results-year-1950-title-11-text.csv`.
def filter_results(
    harvest_id,
    data_dir="data",
    relevance_percent=None,
    year=None,
    title=None,
    add_text=True,
    fields=None,
):
    """
    Filter an existing results set by year and/or title, adding the OCRd text of each
    individual article to a new `full_text` column.

    Parameters:
    - harvest_id: identifier of the harvest to filter
    - data_dir: location of the harvest (default is 'data')
    - relevance_percent: relevance score cut off
    - year: eg '1950'
    - title: Trove newspaper title identifier, eg '11'
    - add_text: add OCRd text to CSV (default is True)
    - fields: list of fields to include in the resulting CSV

    Result:
    - saves the results as a CSV file
    """
    # Copy the field list so a shared default isn't mutated between calls
    if fields is None:
        fields = ["title", "date", "page", "newspaper_title", "url"]
    else:
        fields = list(fields)
    data_path = Path(data_dir, harvest_id)
    df = pd.read_csv(Path(data_path, "results.csv"), low_memory=False)
    filters = []
    if relevance_percent:
        # Keep only articles whose relevance score is above the cut-off
        cutoff = df["relevance"].quantile(relevance_percent / 100)
        df = df.loc[df["relevance"] > cutoff]
        print(f"Relevance score cut-off: {cutoff}")
        filters += ["relevance", str(relevance_percent)]
    if year and title:
        df = df.loc[
            (df["date"].str.startswith(str(year))) & (df["newspaper_id"] == int(title))
        ]
        filters += ["year", str(year), "title", str(title)]
    elif year:
        df = df.loc[df["date"].str.startswith(str(year))]
        filters += ["year", str(year)]
    elif title:
        df = df.loc[df["newspaper_id"] == int(title)]
        filters += ["title", str(title)]
    if add_text:
        # Read the OCRd text of each article into a new `full_text` column
        df["full_text"] = df["text"].apply(lambda x: Path(data_path, x).read_text())
        fields.append("full_text")
        filters.append("text")
    df[fields].to_csv(
        Path(data_path, f"filtered-results-{'-'.join(filters)}.csv"), index=False
    )
Create a filtered results set containing articles published in 1950:
# Insert your harvest identifier between the quotes
filter_results("[Your harvest id]", year=1950)
Create a filtered results set containing articles published in the Canberra Times:
filter_results("[Your harvest id]", title=11)
Create a filtered results set containing articles published in 1950 in the Canberra Times:
filter_results("[Your harvest id]", year=1950, title=11)
Filter the results set using the relevance scores of articles, saving results with scores in the top 50%, without adding the OCRd text to the CSV file.
# Insert your harvest identifier between the quotes
filter_results("[Your harvest id]", relevance_percent=50, add_text=False)
Filter the results set using the relevance scores of articles, saving results with scores in the top 50%, and adding the OCRd text to the CSV file.
# Insert your harvest identifier between the quotes
filter_results("[Your harvest id]", relevance_percent=50)
# IGNORE CELL -- TESTING ONLY
if os.getenv("GW_STATUS") == "dev":
    API_KEY = os.getenv("TROVE_API_KEY")
    query = "https://trove.nla.gov.au/search/category/newspapers?keyword=%22octopus%20intelligence%22"
    params = prepare_query(query=query)
    harvester = Harvester(query_params=params, key=API_KEY, text=True)
    harvester.harvest()
    harvester.save_csv()
    harvest_id = harvester.harvest_dir.name
    slicer = HarvestSlicer(harvest_id)
    slicer.slice_titles()
    slicer.slice_years_top_titles(measure="words", sample_size=20, save_as="text")
    slicer.slice_years()
    filter_results(harvest_id, year=1946)
    slicer = HarvestSlicer(harvest_id, relevance_percent=50)
    slicer.slice_titles()
    slicer.slice_years_top_titles(measure="words", sample_size=20, save_as="text")
    slicer.slice_years()
    filter_results(harvest_id, year=1946, relevance_percent=50)
    shutil.rmtree(harvester.harvest_dir)
Created by Tim Sherratt for the GLAM Workbench.