Skip to content

Commit

Permalink
Merge pull request #54 from noisebridge/36-add-checks-to-ensure-required-filesdirectories-exist
Browse files (browse the repository at this point in the history)

Add checks to ensure required files and directories exist
  • Loading branch information
skyfenton authored Dec 11, 2024
2 parents d79c3ab + 96a998a commit 92f3ac0
Show file tree
Hide file tree
Showing 5 changed files with 30 additions and 16 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ jobs:
pipenv install --dev
- name: Run tests
run: pipenv run pytest
run: pipenv run test

ruff:
runs-on: ubuntu-latest
Expand Down
2 changes: 1 addition & 1 deletion Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@ python_version = "3.12"

[scripts]
dev = "pipenv run python -m mediabridge.main"
test = "pipenv run pytest"
test = "pipenv run python -m pytest"
35 changes: 24 additions & 11 deletions mediabridge/data_processing/wiki_to_netflix.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import csv
import logging
import os
import sys
import time
from dataclasses import dataclass
Expand All @@ -9,6 +8,8 @@
import requests
from tqdm import tqdm

from mediabridge.definitions import DATA_DIR, OUTPUT_DIR


class WikidataServiceTimeoutException(Exception):
pass
Expand All @@ -25,10 +26,7 @@ class MovieData:

# need Genres, Directors, Title, year?

data_dir = os.path.join(os.path.dirname(__file__), "../../data")
out_dir = os.path.join(os.path.dirname(__file__), "../../out")
user_agent = "Noisebridge MovieBot 0.0.1/Audiodude <[email protected]>"

USER_AGENT = "Noisebridge MovieBot 0.0.1/Audiodude <[email protected]>"
DEFAULT_TEST_ROWS = 100


Expand Down Expand Up @@ -220,15 +218,30 @@ def process_data(num_rows=None, output_missing_csv_path=None):
num_rows (int): Number of rows to process. If None, all rows are processed.
output_missing_csv_path (str): If provided, movies that could not be matched will be written to a CSV at this path.
"""

if not DATA_DIR.exists():
raise FileNotFoundError(
f"Data directory does not exist at {DATA_DIR}, please create a new directory containing the netflix prize dataset files\n"
"https://archive.org/details/nf_prize_dataset.tar"
)

movie_data_path = DATA_DIR.joinpath("movie_titles.txt")

if not movie_data_path.exists():
raise FileNotFoundError(
f"{movie_data_path} not found, please download the netflix prize dataset and extract it into the data folder\n"
"https://archive.org/details/nf_prize_dataset.tar"
)

missing_count = 0
processed_data = []
missing = []

netflix_data = list(
read_netflix_txt(os.path.join(data_dir, "movie_titles.txt"), num_rows=num_rows)
)
netflix_data = list(read_netflix_txt(movie_data_path, num_rows))

netflix_csv = OUTPUT_DIR.joinpath("movie_titles.csv")

enriched_movies = wiki_query(netflix_data, user_agent)
enriched_movies = wiki_query(netflix_data, USER_AGENT)

num_rows = len(enriched_movies)

Expand Down Expand Up @@ -266,10 +279,10 @@ def process_data(num_rows=None, output_missing_csv_path=None):
]
processed_data.append(movie)

netflix_csv = os.path.join(out_dir, "movie_titles.csv")
netflix_csv = OUTPUT_DIR.joinpath("movie_titles.csv")
create_netflix_csv(netflix_csv, processed_data)
if output_missing_csv_path:
missing_csv = os.path.join(out_dir, output_missing_csv_path)
missing_csv = OUTPUT_DIR.joinpath(output_missing_csv_path)
create_netflix_csv(missing_csv, missing)

print(
Expand Down
4 changes: 3 additions & 1 deletion mediabridge/definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@
"File path definitions are incorrect, definitions.py is not in the root 'mediabridge' module."
)

MODULE_DIR = Path(__file__).absolute().parent
MODULE_DIR = Path(__file__).parent
PROJECT_DIR = MODULE_DIR.parent
DATA_DIR = PROJECT_DIR.joinpath("data")
OUTPUT_DIR = PROJECT_DIR.joinpath("out")

if __name__ == "__main__":
print(MODULE_DIR)
print(PROJECT_DIR)
print(DATA_DIR)
print(OUTPUT_DIR)
3 changes: 1 addition & 2 deletions mediabridge/main.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import logging
import os
from contextlib import nullcontext
from datetime import datetime

Expand Down Expand Up @@ -35,7 +34,7 @@ def main(
"-m",
help=(
f"If provided, movies that could not be matched will be written to a "
f"CSV at this path, relative to the {os.path.abspath(OUTPUT_DIR)} directory."
f"CSV at this path, relative to the {OUTPUT_DIR} directory."
),
),
):
Expand Down

0 comments on commit 92f3ac0

Please sign in to comment.