feat: Add script to download latest papers from BioRxiv and MedRxiv servers
esloch committed Feb 12, 2024
1 parent 7630397 commit 1e9bd2b
Showing 8 changed files with 103 additions and 2 deletions.
15 changes: 14 additions & 1 deletion .makim.yaml
@@ -11,7 +11,17 @@ groups:
  scheduler:
    targets:
      download-biorxiv-medrxiv:
      fetch_date:
        shell: bash
        run: |
          most_recent_file=$(ls -1 data/rxivx/biorxiv/ | sort -r | head -n 1)
          latest_date=$(echo "$most_recent_file" | grep -oE '[0-9]{4}-[0-9]{2}-[0-9]{2}' | tail -1)
          current_date=$(date +'%Y-%m-%d')
          output_filename="biorxiv_${latest_date}_${current_date}.json"
          echo "$output_filename"
      download-rxivr:
        help: |
          Download data from BioRxiv/MedRxiv API within a specified date range.
        args:
@@ -22,11 +22,14 @@
          begin:
            help: Specify the start date in YYYY-MM-DD format
            type: string
            # default: {{ start_date }}
            required: true
          end:
            help: Specify the end date in YYYY-MM-DD format
            type: string
            # default: {{ current_date }}
            required: true

          target:
            help: Path to save the JSON file (e.g., 'data/')
            type: string
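For reference, a hand-run `download-rxivr` call with these arguments might look like the following; the dates and target path are illustrative, but the flag names match the invocation in `src/fetch_rxivx_data.sh` further down:

```
# Illustrative values; a real run derives --begin from the newest downloaded file
makim scheduler.download-rxivr \
    --server biorxiv \
    --begin 2024-02-01 \
    --end 2024-02-12 \
    --target data/rxivx/biorxiv/downloaded/
```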
18 changes: 17 additions & 1 deletion README.md
@@ -7,7 +7,23 @@ A service for rXiv REST API, such as biorxiv and medrxiv

## Features

TBD
# SRC App

This application fetches the latest papers from the BioRxiv and MedRxiv servers. It automates downloading new papers, merging them with existing data, and keeping the dataset up to date.

## Installation

...

## Usage

Run the following command from the `src/` directory (the script's data paths are relative to it) to fetch the latest papers from the desired server:
```
./fetch_rxivx_data.sh <server_name>
```
Replace `<server_name>` with either `biorxiv` or `medrxiv`, depending on which server you want to fetch papers from.
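
For example, fetching biorxiv (the dates in the reported filename are illustrative; they are derived from the newest file already in `data/rxivx/biorxiv/downloaded/` and today's date):

```
cd src
./fetch_rxivx_data.sh biorxiv
# Downloading biorxiv_2024-02-01_2024-02-12.json database
```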

---

## Credits

@@ -0,0 +1,5 @@
[
{
"doi": "10.1101/12345678"
}
]
5 changes: 5 additions & 0 deletions src/data/rxivx/biorxiv/final/biorxiv_full_data.json
@@ -0,0 +1,5 @@
[
{
"doi": "10.1101/12345678"
}
]
@@ -0,0 +1,5 @@
[
{
"doi": "10.1101/12345678"
}
]
5 changes: 5 additions & 0 deletions src/data/rxivx/medrxiv/final/medrxiv_full_data.json
@@ -0,0 +1,5 @@
[
{
"doi": "10.1101/12345678"
}
]
15 changes: 15 additions & 0 deletions src/fetch_rxivx_data.sh
@@ -0,0 +1,15 @@
#!/bin/bash

# Get the server name from the command-line argument
server=$1
if [ -z "$server" ]; then
    echo "Usage: $0 <biorxiv|medrxiv>" >&2
    exit 1
fi

# Derive the new date window from the most recently downloaded file
most_recent_file=$(ls -1 "data/rxivx/${server}/downloaded" | sort -r | head -n 1)
latest_date=$(echo "$most_recent_file" | grep -oE '[0-9]{4}-[0-9]{2}-[0-9]{2}' | tail -1)
current_date=$(date +'%Y-%m-%d')
output_filename="${server}_${latest_date}_${current_date}.json"

echo "Downloading ${output_filename} database"

makim scheduler.download-rxivr --server "${server}" --begin "${latest_date}" --end "${current_date}" --target "data/rxivx/${server}/downloaded/"

python merge_arxiv_data.py "data/rxivx/${server}/final/${server}_full_data.json" "data/rxivx/${server}/downloaded/${output_filename}"
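
A quick sketch of the date extraction above, with a hypothetical filename: `grep -oE` prints each date match on its own line, so `tail -1` selects the second date (the end of the previous download window), which then becomes the new `--begin`:

```
most_recent_file="biorxiv_2024-01-15_2024-02-01.json"  # hypothetical filename
echo "$most_recent_file" | grep -oE '[0-9]{4}-[0-9]{2}-[0-9]{2}' | tail -1
# prints 2024-02-01
```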
37 changes: 37 additions & 0 deletions src/merge_arxiv_data.py
@@ -0,0 +1,37 @@
import json
import sys


def merge_json_files(existing_file_path: str, new_file_path: str) -> None:
    """
    Merge the data from a new JSON file into an existing JSON file,
    avoiding duplicates based on the 'doi' field.

    Args:
        existing_file_path (str): The path to the existing JSON file.
        new_file_path (str): The path to the new JSON file.

    Returns:
        None
    """
    # Collect the DOIs already present so new entries can be deduplicated
    existing_dois = set()
    with open(existing_file_path, 'r') as f:
        existing_data = json.load(f)
        for entry in existing_data:
            existing_dois.add(entry['doi'])

    # Append only entries whose DOI is not already in the existing file
    with open(new_file_path, 'r') as f:
        new_data = json.load(f)
    merged_data = existing_data.copy()
    for entry in new_data:
        if entry['doi'] not in existing_dois:
            merged_data.append(entry)

    # Overwrite the existing file with the merged result
    with open(existing_file_path, 'w') as f:
        json.dump(merged_data, f, indent=4)


if __name__ == '__main__':
    # Get the file paths from the command-line arguments
    existing_file_path = sys.argv[1]
    new_file_path = sys.argv[2]

    # Call the merge function
    merge_json_files(existing_file_path, new_file_path)
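
The merge step can also be run by hand; a minimal sketch using the paths the shell script passes (the dated filename is illustrative):

```
python merge_arxiv_data.py \
    data/rxivx/biorxiv/final/biorxiv_full_data.json \
    data/rxivx/biorxiv/downloaded/biorxiv_2024-02-01_2024-02-12.json
```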
