Display the results of a harvest as a searchable database using Datasette¶
You've harvested lots of newspaper articles from Trove using the Newspaper Harvester. But how do you examine the results? Before you fire up Excel (which is likely to do weird things with your dates), give Datasette a try!
Datasette is 'a tool for exploring and publishing data'. Give it a CSV file and it turns it into a fully-searchable database, running in your browser. It supports facets, full-text search, and, with a bit of tweaking, can even present images. Although Datasette is a command-line tool, we can run it from within a Jupyter notebook, and open a new window to display the results. This notebook shows you how to load the newspaper data you've harvested into Datasette, and start it up. If you've also harvested full text and images from the newspaper articles, you can add these to your database as well!
Import what we need¶
import json
import os
from pathlib import Path
import pandas as pd
import sqlite_utils
from IPython.display import HTML, display
from jupyter_server import serverapp
Load harvested data into Datasette¶
By default, the cells below load the most recently completed newspaper harvest into Datasette. If you want to load a different harvest, simply supply the harvest's timestamp when you run open_datasette().
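For example, once you've run the code cells below, you could load a specific harvest like this (the timestamp shown is just a placeholder, use one of the directory names under data):

# Load a specific harvest rather than the most recent one
# (replace the placeholder timestamp with one of your own harvest directories)
open_datasette("20211028053512")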
def get_latest_harvest():
"""
Get the timestamp of the most recent harvest.
"""
harvests = sorted(
[d for d in os.listdir("data") if os.path.isdir(os.path.join("data", d))]
)
return harvests[-1]
def open_harvest_data(timestamp=None):
"""
Open the results of the specified harvest (most recent by default).
Returns a list of records.
"""
if not timestamp:
timestamp = get_latest_harvest()
df = pd.read_csv(Path("data", timestamp, "results.csv"))
return df.to_dict("records")
def create_db(timestamp=None):
"""
Create a db named with the supplied timestamp (or the timestamp of the latest harvest).
Load the CSV data from the harvest.
"""
if not timestamp:
timestamp = get_latest_harvest()
db_path = Path("data", timestamp, "results.db")
# Delete an existing db
db_path.unlink(missing_ok=True)
# Get the harvest data
data = open_harvest_data(timestamp)
# Create the db
db = sqlite_utils.Database(db_path)
# Load the data, specifying `article_id` as the primary key
db["records"].insert_all(data, pk="article_id")
return db_path
def get_db(timestamp=None):
    """
    Get the path to the db for the specified harvest (most recent by default),
    creating the db first if it doesn't already exist.
    """
    if not timestamp:
        timestamp = get_latest_harvest()
    db_path = Path("data", timestamp, "results.db")
    if not db_path.exists():
        db_path = create_db(timestamp)
    return db_path
def get_proxy_url():
    """
    Get the base url of the running notebook server and use it to construct
    a proxy url for Datasette. For example, if the server's base url is
    '/user/jovyan/', this returns '/user/jovyan/proxy/8001/'.
    """
    # Get the current running server
    servers = serverapp.list_running_servers()
    base_url = next(servers)["base_url"]
    # Create a base url for Datasette using the jupyter-server-proxy path
    proxy_url = f"{base_url}proxy/8001/"
    return proxy_url
def open_datasette(timestamp=None):
"""
This gets the base url of the currently running notebook. It then uses this url to
construct a link to your Datasette instance, using jupyter-server-proxy.
Finally it creates a button to open up a new tab to view your database.
"""
if not timestamp:
timestamp = get_latest_harvest()
    db_path = str(get_db(timestamp))  # noqa: F841 (used in the shell command below)
proxy_url = get_proxy_url()
# Display a link to Datasette
display(
HTML(
            f'<p><a style="width: 200px; display: block; border: 1px solid #307fc1; background-color: #1976d2; color: #ffffff; padding: 10px; text-align: center; font-weight: bold;" href="{proxy_url}">View in Datasette</a> (Click on the stop button in the top menu bar to close the Datasette server)</p>'
)
)
# Launch Datasette
    !datasette {db_path} --port 8001 --setting base_url {proxy_url} --setting truncate_cells_html 100 --setting facet_suggest_time_limit_ms 100 --setting facet_time_limit_ms 1000 --setting sql_time_limit_ms 5000 --metadata metadata.yml
Add OCRd text and make it searchable¶
The Trove Newspaper Harvester saves the OCRd text of each article into a separate text file, to keep the harvest manageable. But you can easily insert the text into the database and make it fully searchable. Each harvested record includes the relative path of its text file in the text column, so we can loop through the records in our database, read the matching file, and add its contents to the record. Then we just tell Datasette to add a full-text index so the text can be easily searched.
def add_text_to_db(timestamp=None):
"""
Add harvested text files to a db.
"""
if not timestamp:
timestamp = get_latest_harvest()
db_path = get_db(timestamp)
db = sqlite_utils.Database(db_path)
# Add a text column to the db
db["records"].add_column("fulltext", str)
    for row in db["records"].rows:
        # Skip any records that don't have a text file
        if row["text"]:
            db["records"].update(
                row["article_id"],
                {"fulltext": Path("data", timestamp, row["text"]).read_text()},
            )
# Make the text column full text searchable
db["records"].enable_fts(["fulltext"])
db["records"].optimize()
Add image links to database¶
Our aim here is to display thumbnails of the article images alongside the article metadata. To do this we're making use of two Datasette plugins: datasette-media to serve the images, and datasette-json-html to insert image metadata into our database that will be automatically rendered as an HTML img tag. You just install the plugins via pip, and Datasette will automatically use them. As with the text files, each record includes the relative paths of any harvested images, so we'll save an image path against each record and use the article id to request the image via the media server. We'll even add a link on each thumbnail to open up the full-sized image.

Note that the configuration for the datasette-media plugin is contained in the metadata.yml file in this repository.
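For reference, datasette-media's sql option maps a media key to a file path. A configuration along these lines would serve the thumbnail and large media types used in the code below from the image_path column added to each record (a sketch based on the plugin's documentation, not necessarily the exact contents of this repository's metadata.yml):

plugins:
  datasette-media:
    thumbnail:
      sql: |-
        select image_path as filepath, 200 as resize_width from records where article_id = :key
    large:
      sql: |-
        select image_path as filepath from records where article_id = :key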
def add_images_to_db(timestamp=None):
"""
Add harvested images to a db.
"""
if not timestamp:
timestamp = get_latest_harvest()
db_path = get_db(timestamp)
db = sqlite_utils.Database(db_path)
db["records"].add_column("image_preview", str)
db["records"].add_column("image_path", str)
proxy_url = get_proxy_url()
for row in db["records"].rows:
if row["images"]:
full_path = Path("data", timestamp, row["images"].split("|")[0])
# Add the image file path
db["records"].update(row["article_id"], {"image_path": str(full_path)})
# Add some JSON with the url to the image (via the media server and Jupyter proxy)
db["records"].update(
row["article_id"],
{
"image_preview": json.dumps(
{
"img_src": f"{proxy_url}-/media/thumbnail/{row['article_id']}",
"href": f"{proxy_url}-/media/large/{row['article_id']}",
}
)
},
)
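After this cell runs, each record with images has an image_preview value containing a small JSON object with img_src and href keys, which datasette-json-html renders as a clickable thumbnail. You can peek at one record to check, a quick sketch assuming the cells above have been run:

# Show the stored JSON for the first record that has an image
db = sqlite_utils.Database(get_db())
row = next(db["records"].rows_where("image_preview is not null"), None)
if row:
    print(row["image_preview"])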
Open Datasette¶
Run the cell below to start up Datasette. When you see the message saying 'Uvicorn running...', click on the blue button to open Datasette in a new tab.
# Open Datasette
create_db()
add_text_to_db()
add_images_to_db()
open_datasette()
Now what?¶
There are many other Datasette plugins you might use to explore or visualise your data. If you'd like to make your database public, have a look at Share your CSVs online using Datasette and Glitch.
Created by Tim Sherratt (@wragge) for the GLAM Workbench project. Support this project by becoming a GitHub sponsor.