# Let's import the libraries we need.
import glob
import os
import re
import shutil
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import HTML, display
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from slugify import slugify
from tqdm.auto import tqdm

s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))

# Replace the value in the single quotes with the identifier of your chosen journal
journal_id = "nla.obj-320790312"

def harvest_metadata(obj_id):
    """
    This calls an internal API from a journal landing page to extract a list of available issues.
    """
    start_url = "https://nla.gov.au/{}/browse?startIdx={}&rows=20&op=c"
    # The initial startIdx value
    start = 0
    # Number of results per page
    n = 20
    issues = []
    with tqdm(desc="Issues", leave=False) as pbar:
        # If there aren't 20 results on the page then we've reached the end, so continue harvesting until that happens.
        while n == 20:
            # Get the browse page
            response = s.get(start_url.format(obj_id, start), timeout=60)
            # Beautifulsoup turns the HTML into an easily navigable structure
            soup = BeautifulSoup(response.text, "lxml")
            # Find all the divs containing issue details and loop through them
            details = soup.find_all(class_="l-item-info")
            for detail in details:
                issue = {}
                title = detail.find("h3")
                if title:
                    issue["title"] = title.text
                    issue["id"] = title.parent["href"].strip("/")
                else:
                    issue["title"] = "No title"
                    issue["id"] = detail.find("a")["href"].strip("/")
                try:
                    # Get the issue details
                    issue["details"] = detail.find(
                        class_="obj-reference content"
                    ).string.strip()
                except (AttributeError, IndexError):
                    issue["details"] = "issue"
                # Get the number of pages
                try:
                    issue["pages"] = int(
                        re.search(
                            r"^(\d+)",
                            detail.find("a", attrs={"data-pid": issue["id"]}).text,
                            flags=re.MULTILINE,
                        ).group(1)
                    )
                except AttributeError:
                    issue["pages"] = 0
                issues.append(issue)
                # print(issue)
                time.sleep(0.2)
            # Increment the startIdx
            start += n
            # Set n to the number of results on the current page
            n = len(details)
            pbar.update(n)
    return issues


def save_ocr(issues, obj_id, title=None, output_dir="journals"):
    """
    Download the OCRd text for each issue.
    """
    processed_issues = []
    if not title:
        title = issues[0]["title"]
    output_path = os.path.join(output_dir, "{}-{}".format(slugify(title)[:50], obj_id))
    texts_path = os.path.join(output_path, "texts")
    os.makedirs(texts_path, exist_ok=True)
    for issue in tqdm(issues, desc="Texts", leave=False):
        # Default values
        issue["text_file"] = ""
        if issue["pages"] != 0:
            # print(book['title'])
            # The index value for the last page of an issue will be the total pages - 1
            last_page = issue["pages"] - 1
            file_name = "{}-{}-{}.txt".format(
                slugify(issue["title"])[:50],
                slugify(issue["details"])[:50],
                issue["id"],
            )
            file_path = os.path.join(texts_path, file_name)
            # Check to see if the file has already been harvested
            if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
                # print('Already saved')
                issue["text_file"] = file_name
            else:
                url = "https://trove.nla.gov.au/{}/download?downloadOption=ocr&firstPage=0&lastPage={}".format(
                    issue["id"], last_page
                )
                # print(url)
                # Get the file
                r = s.get(url, timeout=120)
                # Check there was no error
                if r.status_code == requests.codes.ok:
                    # Check that the file's not empty
                    r.encoding = "utf-8"
                    if len(r.text) > 0 and not r.text.isspace():
                        # Check that the file isn't HTML (some not found pages don't return 404s)
                        if BeautifulSoup(r.text, "html.parser").find("html") is None:
                            # If everything's ok, save the file
                            with open(file_path, "w", encoding="utf-8") as text_file:
                                text_file.write(r.text)
                            issue["text_file"] = file_name
                time.sleep(1)
        processed_issues.append(issue)
    df = pd.DataFrame(processed_issues)
    # Remove empty directories
    try:
        os.rmdir(texts_path)
        os.rmdir(output_path)
    except OSError:
        # It's not empty, so add list of issues
        df.to_csv(
            os.path.join(output_path, "{}-issues.csv".format(obj_id)), index=False
        )

issues = harvest_metadata(journal_id)

save_ocr(issues, journal_id)

journal_dir = glob.glob(os.path.join("journals", "*-{}".format(journal_id)))[0]
shutil.make_archive(journal_dir, "zip", journal_dir)
display(HTML("<b>Download results</b>"))
display(
    HTML(
        f'<a download="{os.path.basename(journal_dir)}.zip" href="{journal_dir}.zip">{journal_dir}.zip</a>'
    )
)

df = pd.read_csv(
    os.path.join(journal_dir, "{}-issues.csv".format(journal_id)), keep_default_na=False
)
df.head()

num_issues = df.shape[0]
num_text = df.loc[df["text_file"] != ""].shape[0]
print("{} / {} issues have OCRd text".format(num_issues, num_text))

Get OCRd text from a digitised journal in Trove¶

Import what we need¶

What journal do you want?¶

Define some functions to do the work¶

Get a list of issues¶

Download the OCRd texts¶

View and download the results¶