import os
import re
import shutil
import time
from pathlib import Path

import pandas as pd
import requests
from dotenv import load_dotenv
from IPython.display import HTML
from PIL import UnidentifiedImageError
from requests.adapters import HTTPAdapter
from requests.exceptions import HTTPError
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm
from trove_newspaper_images.articles import download_images

s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
s.mount("http://", HTTPAdapter(max_retries=retries))
s.mount("https://", HTTPAdapter(max_retries=retries))

load_dotenv()

# Insert your Trove API key between the quotes
API_KEY = "YOUR API KEY"

# Use api key value from environment variables if it is available
if os.getenv("TROVE_API_KEY"):
    API_KEY = os.getenv("TROVE_API_KEY")

headers = {"X-API-KEY": API_KEY}

# Paste your list id between the quotes, and then run the cell
list_id = "83777"

# If you don't want to save all the OCRd text, change True to False below
save_texts = True

# Change this to True if you want to save PDFs of newspaper articles
save_pdfs = False

# Change this to False if you don't want to save images of newspaper articles
save_images = False

def get_url(identifiers, linktype):
    """
    Loop through the identifiers to find the requested url.
    """
    url = ""
    for identifier in identifiers:
        if identifier["linktype"] == linktype:
            url = identifier["value"]
            break
    return url


def save_as_csv(list_dir, data, data_type):
    df = pd.DataFrame(data)
    df.to_csv("{}/{}-{}.csv".format(list_dir, list_id, data_type), index=False)


def make_filename(article):
    """
    Create a filename for a text file or PDF.
    For easy sorting/aggregation the filename has the format:
        PUBLICATIONDATE-NEWSPAPERID-ARTICLEID
    """
    date = article["date"]
    date = date.replace("-", "")
    newspaper_id = article["newspaper_id"]
    article_id = article["id"]
    return "{}-{}-{}".format(date, newspaper_id, article_id)


def get_list(list_id):
    list_url = f"https://api.trove.nla.gov.au/v3/list/{list_id}?encoding=json&reclevel=full&include=listItems"
    response = s.get(list_url, headers=headers)
    return response.json()


def get_article(id):
    article_api_url = f"https://api.trove.nla.gov.au/v3/newspaper/{id}?encoding=json&reclevel=full&include=articletext"
    response = s.get(article_api_url, headers=headers)
    return response.json()


def make_dirs(list_id):
    list_dir = Path("data", "converted-lists", list_id)
    list_dir.mkdir(parents=True, exist_ok=True)
    Path(list_dir, "text").mkdir(exist_ok=True)
    Path(list_dir, "image").mkdir(exist_ok=True)
    Path(list_dir, "pdf").mkdir(exist_ok=True)
    return list_dir


def ping_pdf(ping_url):
    """
    Check to see if a PDF is ready for download.
    If a 200 status code is received, return True.
    """
    ready = False
    # req = Request(ping_url)
    try:
        # urlopen(req)
        response = s.get(ping_url, timeout=30)
        response.raise_for_status()
    except HTTPError:
        if response.status_code == 423:
            ready = False
        else:
            raise
    else:
        ready = True
    return ready


def get_pdf_url(article_id, zoom=3):
    """
    Download the PDF version of an article.
    These can take a while to generate, so we need to ping the server to see if it's ready before we download.
    """
    pdf_url = None
    # Ask for the PDF to be created
    prep_url = f"https://trove.nla.gov.au/newspaper/rendition/nla.news-article{article_id}/level/{zoom}/prep"
    response = s.get(prep_url)
    # Get the hash
    prep_id = response.text
    # Url to check if the PDF is ready
    ping_url = f"https://trove.nla.gov.au/newspaper/rendition/nla.news-article{article_id}.{zoom}.ping?followup={prep_id}"
    tries = 0
    ready = False
    time.sleep(2)  # Give some time to generate pdf
    # Are you ready yet?
    while ready is False and tries < 5:
        ready = ping_pdf(ping_url)
        if not ready:
            tries += 1
            time.sleep(2)
    # Download if ready
    if ready:
        pdf_url = f"https://trove.nla.gov.au/newspaper/rendition/nla.news-article{article_id}.{zoom}.pdf?followup={prep_id}"
    return pdf_url


def harvest_list(list_id, save_text=True, save_pdfs=False, save_images=False):
    list_dir = make_dirs(list_id)
    data = get_list(list_id)
    works = []
    articles = []
    for item in tqdm(data["listItem"]):
        for zone, record in item.items():
            if zone == "work":
                work = {
                    "id": record.get("id", ""),
                    "title": record.get("title", ""),
                    "type": "|".join(record.get("type", [])),
                    "issued": record.get("issued", ""),
                    "contributor": "|".join(record.get("contributor", [])),
                    "trove_url": record.get("troveUrl", ""),
                    "fulltext_url": get_url(record.get("identifier", ""), "fulltext"),
                    "thumbnail_url": get_url(record.get("identifier", ""), "thumbnail"),
                }
                works.append(work)
            elif zone == "article":
                article = {
                    "id": record.get("id"),
                    "title": record.get("heading", ""),
                    "category": record.get("category", ""),
                    "date": record.get("date", ""),
                    "newspaper_id": record.get("title", {}).get("id"),
                    "newspaper_title": record.get("title", {}).get("title"),
                    "page": record.get("page", ""),
                    "page_sequence": record.get("pageSequence", ""),
                    "trove_url": f'http://nla.gov.au/nla.news-article{record.get("id")}',
                }
                full_details = get_article(record.get("id"))
                article["words"] = full_details.get("wordCount", "")
                article["illustrated"] = full_details.get("illustrated", "")
                article["corrections"] = full_details.get("correctionCount", "")
                if "trovePageUrl" in full_details:
                    page_id = re.search(
                        r"page(\d+)", full_details["trovePageUrl"]
                    ).group(1)
                    article["page_url"] = (
                        f"http://trove.nla.gov.au/newspaper/page/{page_id}"
                    )
                else:
                    article["page_url"] = ""
                filename = make_filename(article)
                if save_texts:
                    text = full_details.get("articleText")
                    text_file = Path(list_dir, "text", f"{filename}.txt")
                    if text:
                        text = re.sub(r"<[^<]+?>", "", text)
                        text = re.sub(r"\s\s+", " ", text)
                        text_file = Path(list_dir, "text", f"{filename}.txt")
                        with open(text_file, "wb") as text_output:
                            text_output.write(text.encode("utf-8"))
                if save_pdfs:
                    pdf_url = get_pdf_url(record["id"])
                    if pdf_url:
                        pdf_file = Path(list_dir, "pdf", f"{filename}.pdf")
                        response = s.get(pdf_url, stream=True)
                        with open(pdf_file, "wb") as pf:
                            for chunk in response.iter_content(chunk_size=128):
                                pf.write(chunk)
                if save_images:
                    images = []
                    tries = 0
                    # Trove has had some issues loading newspaper images lately
                    # This is an attempted workaround
                    while not images and tries < 2:
                        try:
                            images = download_images(
                                article["id"], Path(list_dir, "image"), masked=True
                            )
                        except UnidentifiedImageError:
                            time.sleep(5)
                            tries += 1

                articles.append(article)
    if articles:
        save_as_csv(list_dir, articles, "articles")
    if works:
        save_as_csv(list_dir, works, "works")
    return works, articles

works, articles = harvest_list(list_id, save_texts, save_pdfs, save_images)

# Preview newspaper articles CSV
df_articles = pd.DataFrame(articles)
df_articles

# Preview works CSV
df_works = pd.DataFrame(works)
df_works

list_dir = Path("data", "converted-lists", list_id)
shutil.make_archive(list_dir, "zip", list_dir)
HTML(f'<a download="{list_id}.zip" href="{list_dir}.zip">Download your harvest</a>')

Convert a Trove list into a CSV file¶

Set things up¶

Add your values to these two cells¶

Define some functions¶

Let's do it!¶

View the results¶

Download the results¶