# vocabulary-csv-exporter/run.py
# Extracts (word, definitions) pairs from vocabulary markdown files and
# writes them to word_definitions.txt.
import csv
import os
import re
def remove_links(definitions: str) -> str:
    """Strip wiki-style links from *definitions*, keeping the display text.

    ``[[word]]``         -> ``word``
    ``[[target|alias]]`` -> ``alias``

    Args:
        definitions: Text possibly containing ``[[...]]`` links.

    Returns:
        The text with link markup removed.
    """
    # Plain [[word]] links (no pipe): keep the inner text.
    definitions = re.sub(r"\[\[([^\|\]\]]*)\]\]", r"\1", definitions)
    # Aliased [[target|alias]] links: keep only the alias after the pipe.
    definitions = re.sub(r"\[\[[^\[\[]*\|([^\|\]\]]*)\]\]", r"\1", definitions)
    return definitions
def read_markdown_file(file_path: str, filename: str) -> tuple[str, str] | None:
try:
with open(file_path, "r", encoding="utf-8") as f:
content = f.read()
word = os.path.splitext(filename)[0]
content_parts = content.split("??")
if len(content_parts) > 1:
definitions = content_parts[1]
else:
raise Exception("No delimiter found")
filtered_lines = []
for line in definitions.split("\n"):
line = line.strip()
if line.startswith(("n.", "adj.", "adv.", "v.")):
filtered_lines.append(line)
definitions = "\n".join(filtered_lines)
definitions = remove_links(definitions)
return word, definitions
except Exception as e:
print(f"Error reading {file_path}: {e}")
return None
def walk_directory(directory: str = ".") -> list[tuple[str, str]]:
    """Recursively collect ``(word, definitions)`` pairs from all ``.md`` files.

    Args:
        directory: Root directory to search. Defaults to the current
            directory.

    Returns:
        One ``(word, definitions)`` tuple per successfully parsed markdown
        file; files that fail to parse are skipped (the parser prints the
        error itself).
    """
    word_definitions: list[tuple[str, str]] = []
    for root, _dirs, files in os.walk(directory):
        for file in files:
            # Only vocabulary markdown files are of interest.
            if not file.endswith(".md"):
                continue
            result = read_markdown_file(os.path.join(root, file), file)
            if result is not None:
                word_definitions.append(result)
    return word_definitions
if __name__ == "__main__":
    word_definitions = walk_directory("words/PartB")
    # Sort alphabetically by word (first element of each tuple) so the
    # output file is deterministic across runs.
    word_definitions.sort(key=lambda x: x[0])
    # Write one "word,definitions" record per entry, blank-line separated.
    with open("word_definitions.txt", "w", encoding="utf-8") as textfile:
        for word, definitions in word_definitions:
            textfile.write(f"{word},{definitions}\n\n")
    # Report how many entries were exported.
    print(len(word_definitions))