# Let's import the libraries we need.
import os
import re
import shutil
import time
from datetime import timedelta
from pathlib import Path

import pandas as pd
import requests
import requests_cache
from dotenv import load_dotenv
from requests.adapters import HTTPAdapter
from requests.exceptions import ConnectionError, Timeout
from requests.packages.urllib3.util.retry import Retry
from slugify import slugify
from tqdm.auto import tqdm

s = requests_cache.CachedSession(expire_after=timedelta(days=30))
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))

load_dotenv()

def check_for_file(issue, texts_path):
    issue_date = issue["date"] if issue["date"] else "nd"
    file_name = f"{issue_date}-{issue['id']}.txt"
    file_path = Path(texts_path, file_name)
    if file_path.exists():
        return file_name
    return ""


def download_issue(issue_id, last_page, file_path):
    url = f"https://trove.nla.gov.au/{issue_id}/download?downloadOption=ocr&firstPage=0&lastPage={last_page}"
    # print(url)
    # Get the file
    try:
        response = s.get(url, timeout=180)
    except (Timeout, ConnectionError) as err:
        print(f"{type(err).__name__}: {url}")
    else:
        # Check there was no error
        if response.status_code == requests.codes.ok:
            # Check that the file's not empty
            response.encoding = "utf-8"
            # Check that the file isn't HTML (some not found pages don't return 404s)
            # BS is too lax and will pass text files that happen to have html tags in them
            # if BeautifulSoup(r.text, "html.parser").find("html") is None:
            if (
                len(response.text) > 0
                and not response.text.isspace()
                and not re.search(r"</HTML>", response.text, re.IGNORECASE)
            ):
                file_path.write_text(response.text)
            if not response.from_cache:
                time.sleep(0.5)


def download_all_issues(df, output_dir="periodicals"):
    # Group issues by title, then loop trhough the titles/issues
    for title, issues in tqdm(df.groupby(["title_id", "title"])):
        output_path = Path(output_dir, f"{slugify(title[1])[:50]}-{title[0]}")
        texts_path = Path(output_path, "texts")
        texts_path.mkdir(exist_ok=True, parents=True)
        issues_with_pages = issues.loc[issues["pages"] > 0]
        for issue in tqdm(
            issues_with_pages.itertuples(),
            total=issues_with_pages.shape[0],
            leave=False,
        ):
            last_page = issue.pages - 1
            issue_date = issue.date if issue.date else "nd"
            file_name = f"{issue_date}-{issue.id}.txt"
            file_path = Path(texts_path, file_name)
            # Check to see if the file has already been harvested
            if not file_path.exists():
                download_issue(issue.id, last_page, file_path)
        issues["text_file"] = issues.apply(check_for_file, args=(texts_path,), axis=1)
        issues.to_csv(Path(output_path, "issues.csv"), index=False)

# Load issues dataset
df = pd.read_csv(
    "https://github.com/GLAM-Workbench/trove-periodicals-data/raw/main/periodical-issues.csv",
    keep_default_na=False,
)

# Download texts
download_all_issues(df)

# IGNORE THIS CELL -- FOR TESTING ONLY
if os.getenv("GW_STATUS") == "dev":
    df = pd.read_csv(
        "https://github.com/GLAM-Workbench/trove-periodicals-data/raw/main/periodical-issues.csv",
        keep_default_na=False,
    )

download_all_issues(df[:10], output_dir="texts_test")
shutil.rmtree("texts_test")

Download the OCRd text for ALL the digitised periodicals in Trove!¶

Setting things up¶