Download issues of a periodical as PDFs
This notebook helps you download the issues of a digitised periodical as PDFs. You can download all digitised issues, or specify a range of years to include.
There are three main steps:
- get a list of all the nla.obj identifiers of the periodical's issues
- get the number of pages in each issue
- construct a url to download each issue as a PDF using the nla.obj identifier and the number of pages
Depending on the periodical, this could take many hours to complete and consume a lot of disk space.
# Let's import the libraries we need.
import json
import re
import time
from datetime import timedelta
from pathlib import Path
import arrow
import pandas as pd
import requests_cache
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm
s = requests_cache.CachedSession(expire_after=timedelta(days=30))
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))
Set your parameters
Edit the cell below to insert the nla.obj identifier of the periodical. This identifier will point to the top-level collection page of a periodical in Trove's digitised object viewer. For example, the url of the top-level page of The Bulletin is https://nla.gov.au/nla.obj-68375465, so the identifier is nla.obj-68375465. If you're viewing a digitised issue or page within a periodical, you can use the breadcrumbs link to navigate up to the top-level page.
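If you're starting from a full viewer url, you can pull the identifier out with a simple regular expression (the re module is imported in the first cell). This is just a convenience sketch rather than part of the notebook's workflow; the url below is the Bulletin example mentioned above, and the identifier variable name is only illustrative.
# Extract the nla.obj identifier from a viewer url
url = "https://nla.gov.au/nla.obj-68375465"
identifier = re.search(r"nla\.obj-\d+", url).group(0)
print(identifier)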
Finding digitised periodicals in Trove is not always easy – the Trove Data Guide provides some hints. You can also search this database of digitised periodical titles.
By default, this notebook will download all the issues of a periodical. If you only want issues from a particular range of years, set the start_year and/or end_year values in the cell below. For example, setting start_year = 1900 and end_year = 1940 will download all issues published between 1900 and 1940.
Once you've made your changes to the cell below, select 'Run > Run All Cells' from the menu.
# Insert the periodical's nla.obj identifier
periodical_id = "nla.obj-8423556"
# Optionally set a range of years
start_year = None
end_year = None
Get issue identifiers
Version 3 of the Trove API added a new endpoint to provide information about periodical titles and issues. However, the issues data provided by the API is incomplete. A more reliable alternative is to scrape the list of issues from the browse window in the digitised object viewer – see HOW TO: Get a list of items from a digitised collection in the Trove Data Guide.
def get_issue_ids(periodical_id):
    # The initial startIdx value
    start = 0
    # Number of results per page, used to increment the startIdx value
    n = 20
    items = []
    with tqdm() as pbar:
        # If there aren't 20 results on the page then we've reached the end, so continue harvesting until that happens.
        while n == 20:
            url = f"https://nla.gov.au/{periodical_id}/browse?startIdx={start}&rows=20&op=c"
            # Get the browse page
            response = s.get(url)
            # BeautifulSoup turns the HTML into an easily navigable structure
            soup = BeautifulSoup(response.text, "html.parser")
            # Find all the divs containing issue details and loop through them
            details = soup.find_all(class_="l-item-info")
            for detail in details:
                # Look for the a tag with class "obj-reference content"
                item_id = detail.find(
                    lambda tag: tag.name == "a"
                    and tag.get("class") == ["obj-reference", "content"]
                )["href"].strip("/")
                # Save the issue id
                items.append(item_id)
            if not response.from_cache:
                time.sleep(0.2)
            # Increment the startIdx
            start += n
            # Set n to the number of results on the current page
            n = len(details)
            pbar.update(n)
    return items
issue_ids = get_issue_ids(periodical_id)
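As a quick check, you can see how many issue identifiers were harvested and preview the first few. This uses only the issue_ids list created above.
# How many issues were found?
print(f"Found {len(issue_ids)} issues")
# Preview the first five identifiers
issue_ids[:5]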
Get number of pages in each issue
It's possible to scrape the number of pages along with the identifiers in the previous step. However, I'm not certain that the information is displayed consistently across all periodicals. To play it safe, I'm extracting embedded metadata from the digitised object viewer and getting the number of pages, issue dates, and publication details (if available). See HOW TO: Extract additional metadata from the digitised resource viewer in the Trove Data Guide.
def get_metadata(id):
    """
    Extract work data in a JSON string from the work's HTML page.
    """
    if not id.startswith("http"):
        id = "https://nla.gov.au/" + id
    response = s.get(id)
    try:
        work_data = re.search(
            r"var work = JSON\.parse\(JSON\.stringify\((\{.*\})", response.text
        ).group(1)
    except AttributeError:
        work_data = "{}"
    if not response.from_cache:
        time.sleep(0.2)
    return json.loads(work_data)
def get_issue_data(issue_ids):
    issues = []
    for issue_id in tqdm(issue_ids):
        metadata = get_metadata(issue_id)
        date = metadata.get("issueDate", "")
        try:
            iso_date = arrow.get(date, "ddd, D MMM YYYY").format("YYYY-MM-DD")
        except arrow.parser.ParserMatchError:
            iso_date = ""
        issue = {
            "id": issue_id,
            "date": date,
            "iso_date": iso_date,
            "details": metadata.get("subUnitNo", ""),
            # Use .get() with defaults so issues with no extractable metadata don't raise a KeyError
            "pages": len(metadata.get("children", {}).get("page", [])),
        }
        issues.append(issue)
    return issues
issues = get_issue_data(issue_ids)
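Before downloading anything, it can be useful to preview the harvested issue metadata, for example by loading it into a dataframe (pandas is already imported above).
# Preview the harvested issue metadata
pd.DataFrame(issues).head()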
Download PDFs
Once we have the identifier and number of pages we can construct a url to download each issue. See: HOW TO: Get text, images, and PDFs using Trove’s download link in the Trove Data Guide.
The downloaded PDFs will be saved in the pdfs directory, within a subdirectory named using the periodical's nla.obj identifier.
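If you want to check the download link before committing to a full harvest, here's a minimal sketch that downloads just the first issue in the list using the same url pattern described above. It assumes the issues list and the s session from the cells above; the test filename is only an example.
# Download a single issue as a test
issue = issues[0]
test_url = f"https://nla.gov.au/{issue['id']}/download?downloadOption=pdf&firstPage=0&lastPage={issue['pages'] - 1}"
response = s.get(test_url)
Path(f"test-{issue['id']}.pdf").write_bytes(response.content)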
def filter_issues(issues, start_year=None, end_year=None):
    filtered = []
    if not (start_year or end_year):
        return issues
    for issue in issues:
        year = issue["iso_date"][:4]
        if year:
            year = int(year)
            if start_year and end_year:
                if year >= start_year and year <= end_year:
                    filtered.append(issue)
            elif start_year:
                if year >= start_year:
                    filtered.append(issue)
            elif end_year:
                if year <= end_year:
                    filtered.append(issue)
    return filtered
def download_pdfs(issues, start_year=None, end_year=None):
    output_dir = Path("pdfs", periodical_id)
    output_dir.mkdir(exist_ok=True, parents=True)
    for issue in tqdm(filter_issues(issues, start_year, end_year)):
        pdf_url = f"https://nla.gov.au/{issue['id']}/download?downloadOption=pdf&firstPage=0&lastPage={issue['pages'] - 1}"
        response = s.get(pdf_url, stream=True)
        # Use the issue date in the filename if there is one
        if issue["iso_date"]:
            filename = f"{issue['iso_date']}-{issue['id']}.pdf"
        else:
            filename = f"{issue['id']}.pdf"
        Path(output_dir, filename).write_bytes(response.content)
        if not response.from_cache:
            time.sleep(1)
download_pdfs(issues, start_year, end_year)
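Once the downloads have finished, you can list the saved files to check that everything arrived as expected. This only uses the pdfs output directory created above.
# List the downloaded PDFs
pdfs = sorted(Path("pdfs", periodical_id).glob("*.pdf"))
print(f"{len(pdfs)} PDFs downloaded")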
Save metadata
You can also save the harvested issue metadata as a CSV file. It will be saved in the same directory as the PDFs.
name_parts = [str(p) for p in [periodical_id, "issues", start_year, end_year] if p]
csv_filename = f"{'-'.join(name_parts)}.csv"
df = pd.DataFrame(issues)
df.to_csv(Path("pdfs", periodical_id, csv_filename), index=False)
Created by Tim Sherratt for the GLAM Workbench.