# vocabulary-csv-exporter/run.py

import csv
import os
import re


def remove_links(definitions: str) -> str:
    """Replace double-bracket links in *definitions* with their visible text.

    ``[[target]]`` becomes ``target`` and ``[[target|label]]`` becomes
    ``label``. The bare form is rewritten first, then the piped form.

    Args:
        definitions: Text that may contain ``[[...]]`` links.

    Returns:
        The text with every matched link replaced by plain text.
    """
    link_patterns = (
        r"\[\[([^\|\]\]]*)\]\]",  # [[target]]
        r"\[\[[^\[\[]*\|([^\|\]\]]*)\]\]",  # [[target|label]]
    )
    unlinked = definitions
    for pattern in link_patterns:
        unlinked = re.sub(pattern, r"\1", unlinked)
    return unlinked


def replace_asterisks_with_italics(text: str) -> str:
    """Turn ``*`` markers in *text* into alternating HTML italic tags.

    The 1st, 3rd, 5th, ... asterisk becomes ``<i>`` and the 2nd, 4th, ...
    becomes ``</i>``. Asterisks are not checked for balance, so an odd
    count leaves the last ``<i>`` unclosed.

    Args:
        text: Text in which asterisks delimit italic spans.

    Returns:
        The text with each asterisk replaced by an opening or closing tag.
    """
    segments = text.split("*")
    pieces = [segments[0]]
    # Each remaining segment was preceded by exactly one asterisk; even
    # positions open an italic span and odd positions close it.
    for position, segment in enumerate(segments[1:]):
        pieces.append("<i>" if position % 2 == 0 else "</i>")
        pieces.append(segment)
    return "".join(pieces)


def extract_example_sentences(content: str) -> str:
    """Render the lines of *content* after the first as an HTML ``<ul>``.

    The first line is skipped and whitespace-only lines are dropped. Each
    remaining line loses its first two characters (presumably a "- " bullet
    marker; TODO confirm against the note format), is stripped, has its
    asterisks converted to italic tags, and is wrapped in ``<li>``.

    Args:
        content: Text whose lines after the first hold the sentences.

    Returns:
        A ``<ul>...</ul>`` string with one ``<li>`` per kept line, joined by
        newlines; ``<ul></ul>`` when no line qualifies.
    """
    items = []
    for raw_line in content.split("\n")[1:]:
        if not raw_line.strip():
            continue
        sentence = raw_line[2:].strip()
        items.append(f"<li>{replace_asterisks_with_italics(sentence)}</li>")
    return "<ul>" + "\n".join(items) + "</ul>"


def read_markdown_file(file_path: str, filename: str) -> tuple[str, str, str] | None:
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        word = os.path.splitext(filename)[0]

        content_parts = content.split("??")
        if len(content_parts) <= 1:
            raise Exception("No delimiter found")

        definitions = content_parts[1]

        examples = extract_example_sentences(content_parts[0])

        filtered_lines = []
        for line in definitions.split("\n"):
            line = line.strip()
            if line.startswith(("n.", "adj.", "adv.", "v.", "prep.")):
                filtered_lines.append(line)

        definitions = "\n".join(filtered_lines)
        definitions = remove_links(definitions)
        return word, definitions, examples
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None


def walk_directory(directory=".") -> list[tuple[str, str, str]]:
    """
    Collect the parsed entries of every .md file under the given directory.

    The directory tree is walked recursively. Each file whose name ends in
    ".md" is passed to read_markdown_file, and files for which it returns
    nothing are skipped.

    Args:
        directory (str): The directory to start searching from. Defaults to current directory.

    Returns:
        A list of (word, definitions, examples) tuples in the order the
        files were visited.
    """
    entries = []
    for root, _subdirs, filenames in os.walk(directory):
        for filename in filenames:
            if filename.endswith(".md"):
                parsed = read_markdown_file(os.path.join(root, filename), filename)
                if parsed:
                    entries.append(parsed)
    return entries


if __name__ == "__main__":
    # Collect every parsable entry under the current directory, ordered by
    # word (the first element of each tuple).
    word_definitions = walk_directory(".")
    word_definitions.sort(key=lambda entry: entry[0])

    # Emit one CSV row per entry (word, definitions, examples); no header
    # row is written. newline="" leaves line-ending handling to the csv module.
    with open("word_definitions.csv", "w", encoding="utf-8", newline="") as csvfile:
        csv.writer(csvfile).writerows(word_definitions)

    # Report how many entries were exported.
    print(len(word_definitions))