Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add checks to ensure required files and directories exist #54

Merged
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ jobs:
pipenv install --dev

- name: Run tests
run: pipenv run pytest
run: pipenv run test

ruff:
runs-on: ubuntu-latest
Expand Down
2 changes: 1 addition & 1 deletion Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@ python_version = "3.12"

[scripts]
dev = "pipenv run python -m mediabridge.main"
test = "pipenv run pytest"
test = "pipenv run python -m pytest"
35 changes: 24 additions & 11 deletions mediabridge/data_processing/wiki_to_netflix.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import csv
import logging
import os
import sys
import time
from dataclasses import dataclass
Expand All @@ -9,6 +8,8 @@
import requests
from tqdm import tqdm

from mediabridge.definitions import DATA_DIR, OUTPUT_DIR


class WikidataServiceTimeoutException(Exception):
pass
Expand All @@ -25,10 +26,7 @@ class MovieData:

# need Genres, Directors, Title, year?

data_dir = os.path.join(os.path.dirname(__file__), "../../data")
out_dir = os.path.join(os.path.dirname(__file__), "../../out")
user_agent = "Noisebridge MovieBot 0.0.1/Audiodude <[email protected]>"

USER_AGENT = "Noisebridge MovieBot 0.0.1/Audiodude <[email protected]>"
DEFAULT_TEST_ROWS = 100


Expand Down Expand Up @@ -220,15 +218,30 @@ def process_data(num_rows=None, output_missing_csv_path=None):
num_rows (int): Number of rows to process. If None, all rows are processed.
output_missing_csv_path (str): If provided, movies that could not be matched will be written to a CSV at this path.
"""

if not DATA_DIR.exists():
raise FileNotFoundError(
f"Data directory does not exist at {DATA_DIR}, please create a new directory containing the netflix prize dataset files\n"
"https://archive.org/details/nf_prize_dataset.tar"
skyfenton marked this conversation as resolved.
Show resolved Hide resolved
)

movie_data_path = DATA_DIR.joinpath("movie_titles.txt")

if not movie_data_path.exists():
raise FileNotFoundError(
f"{movie_data_path} not found, please download the netflix prize dataset and extract it into the data folder\n"
"https://archive.org/details/nf_prize_dataset.tar"
)

missing_count = 0
processed_data = []
missing = []

netflix_data = list(
read_netflix_txt(os.path.join(data_dir, "movie_titles.txt"), num_rows=num_rows)
)
netflix_data = list(read_netflix_txt(movie_data_path, num_rows))

netflix_csv = OUTPUT_DIR.joinpath("movie_titles.csv")

enriched_movies = wiki_query(netflix_data, user_agent)
enriched_movies = wiki_query(netflix_data, USER_AGENT)

num_rows = len(enriched_movies)

Expand Down Expand Up @@ -266,10 +279,10 @@ def process_data(num_rows=None, output_missing_csv_path=None):
]
processed_data.append(movie)

netflix_csv = os.path.join(out_dir, "movie_titles.csv")
netflix_csv = OUTPUT_DIR.joinpath("movie_titles.csv")
create_netflix_csv(netflix_csv, processed_data)
if output_missing_csv_path:
missing_csv = os.path.join(out_dir, output_missing_csv_path)
missing_csv = OUTPUT_DIR.joinpath(output_missing_csv_path)
create_netflix_csv(missing_csv, missing)

print(
Expand Down
4 changes: 3 additions & 1 deletion mediabridge/definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@
"File path definitions are incorrect, definitions.py is not in the root 'mediabridge' module."
)

MODULE_DIR = Path(__file__).absolute().parent
MODULE_DIR = Path(__file__).parent
PROJECT_DIR = MODULE_DIR.parent
DATA_DIR = PROJECT_DIR.joinpath("data")
OUTPUT_DIR = PROJECT_DIR.joinpath("out")

if __name__ == "__main__":
print(MODULE_DIR)
print(PROJECT_DIR)
print(DATA_DIR)
print(OUTPUT_DIR)
3 changes: 1 addition & 2 deletions mediabridge/main.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import logging
import os
from contextlib import nullcontext
from datetime import datetime

Expand Down Expand Up @@ -35,7 +34,7 @@ def main(
"-m",
help=(
f"If provided, movies that could not be matched will be written to a "
f"CSV at this path, relative to the {os.path.abspath(OUTPUT_DIR)} directory."
f"CSV at this path, relative to the {OUTPUT_DIR} directory."
),
),
):
Expand Down
Loading