Merge pull request #152 from roboflow/dataset-upload

Upload Dataset
roboflow · Jun 21, 2023 · 35f38a4 · 35f38a4
2 parents 8694dfe + 66c4d58
commit 35f38a4
Show file tree

Hide file tree

Showing 4 changed files with 82 additions and 2 deletions.
diff --git a/requirements.txt b/requirements.txt
@@ -12,6 +12,7 @@ python-dateutil
 python-dotenv
 requests
 six
+supervision
 urllib3>=1.26.6
 wget
 tqdm>=4.41.0

diff --git a/roboflow/__init__.py b/roboflow/__init__.py
@@ -12,7 +12,7 @@
 from roboflow.core.workspace import Workspace
 from roboflow.util.general import write_line
 
-__version__ = "1.0.9"
+__version__ = "1.1.0"
 
 
 def check_key(api_key, model, notebook, num_retries=0):

diff --git a/roboflow/core/project.py b/roboflow/core/project.py
@@ -361,7 +361,7 @@ def __annotation_upload(
 
         # check if annotation file exists
         elif os.path.exists(annotation_path):
-            print("-> found given annotation file")
+            # print("-> found given annotation file")
             annotation_string = open(annotation_path, "r").read()
 
         # if not annotation file, check if user wants to upload regular as classification annotation

diff --git a/roboflow/core/workspace.py b/roboflow/core/workspace.py
@@ -1,11 +1,16 @@
+import concurrent.futures
 import glob
 import json
 import os
+import random
+import re
 import sys
 
 import requests
+import supervision as sv
 from numpy import ndarray
 from PIL import Image
+from tqdm import tqdm
 
 from roboflow.config import API_URL, CLIP_FEATURIZE_URL, DEMO_KEYS
 from roboflow.core.project import Project
@@ -15,6 +20,7 @@
     count_comparisons,
 )
 from roboflow.util.clip_compare_utils import clip_encode
+from roboflow.util.general import write_line
 from roboflow.util.two_stage_utils import ocr_infer
 
 
@@ -242,6 +248,79 @@ def two_stage_ocr(
 
         return results
 
+    def upload_dataset(
+        self,
+        dataset_path,
+        project_name,
+        num_workers=10,
+        dataset_format="yolov8",
+        project_license="MIT",
+        project_type="object-detection",
+    ):
+        """Upload a local object-detection dataset to a project in this workspace.
+
+        YOLO datasets are first converted to Pascal VOC in "<dataset_path>_voc".
+        The project is created when project_name is not in self.project_list.
+
+        Args:
+            dataset_path (str): root with train/valid/test folders of .jpg images
+            project_name (str): project to upload into
+            num_workers (int): maximum number of concurrent upload threads
+            dataset_format (str): "voc", "yolov8" or "yolov5"
+            project_license (str): license passed on when creating the project
+            project_type (str): only "object-detection" is supported
+
+        Raises:
+            ValueError: if project_type or dataset_format is not supported
+        """
+        if project_type != "object-detection":
+            raise ValueError(
+                "upload_dataset only supported for object-detection projects"
+            )
+
+        if dataset_format not in ["voc", "yolov8", "yolov5"]:
+            raise ValueError(
+                "dataset_format not supported - please use voc, yolov8, yolov5. PS, you can always convert your dataset in the Roboflow UI"
+            )
+
+        if dataset_format in ("yolov8", "yolov5"):
+            voc_path = dataset_path + "_voc"
+            for split in ["train", "valid", "test"]:
+                sv.DetectionDataset.from_yolo(
+                    images_directory_path=dataset_path + "/" + split + "/images",
+                    annotations_directory_path=dataset_path + "/" + split + "/labels",
+                    data_yaml_path=dataset_path + "/data.yaml",
+                ).as_pascal_voc(
+                    images_directory_path=voc_path + "/" + split,
+                    annotations_directory_path=voc_path + "/" + split,
+                )
+            dataset_path = voc_path
+
+        if project_name in self.project_list:
+            dataset_upload_project = self.project(project_name)
+        else:
+            dataset_upload_project = self.create_project(
+                project_name,
+                project_license=project_license,
+                annotation=project_name,
+                project_type=project_type,
+            )
+
+        def upload_file(img_file, split):
+            # swap only the extension; str.replace would also hit ".jpg" in dirs
+            label_file = os.path.splitext(img_file)[0] + ".xml"
+            dataset_upload_project.upload(
+                image_path=img_file, annotation_path=label_file, split=split
+            )
+
+        splits = (("train", "training"), ("valid", "validation"), ("test", "test"))
+        with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as pool:
+            for split, label in splits:
+                write_line(f"uploading {label} set...")
+                file_list = glob.glob(dataset_path + "/" + split + "/*.jpg")
+                jobs = pool.map(upload_file, file_list, [split] * len(file_list))
+                list(tqdm(jobs, total=len(file_list)))
+
     def active_learning(
         self,
         raw_data_location: str = "",