update readme with link
sanderland authored May 10, 2024
2 parents 4ca78b5 + fce43c7 commit 44ff2e8
Showing 12 changed files with 4,177 additions and 16 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -1,7 +1,7 @@


# verification results are commited only in jsonl.gz
-results/verification/*.jsonl
+results/verifications/*.jsonl

# space for gitignored experiments and hq figures
experiments/
12 changes: 6 additions & 6 deletions README.md
@@ -1,6 +1,6 @@
-# Code for the paper "Fishing for Magikarps"
+# Code for the paper "Fishing for Magikarp"

-Paper: [add link]
+This repository contains the code and extended results for the paper [Fishing for Magikarp: Automatically Detecting Under-trained Tokens in Large Language Models](https://arxiv.org/abs/2405.05417)

## Exploring Results

@@ -13,7 +13,7 @@ The most interesting thing in this repository is probably the detailed reports,

### Setup

-<details><summary>This is a standard [poetry](https://python-poetry.org/) project.</summary>
+<details><summary>This is a standard <a href="https://python-poetry.org/">poetry</a> project.</summary>

```bash
poetry shell # make/activate your virtual environment
@@ -26,14 +26,14 @@ poetry install # only the first time or on updates

See `run_verification.sh` for some example commands for running new models. The script itself is mainly a reference for reproducibility; running it directly is not recommended.

-For models with tied embeddings, or for nicer visualizations, you will need to hard-code some unused token ids in `magikarp/unused_tokens.py`.
+For models with tied embeddings, or for nicer visualizations and results, you will need to hard-code some unused token ids in `magikarp/unused_tokens.py`.

* If a related model already exists, copying the token ids is likely to work just fine.
* For non-tied embeddings you can typically just let verification finish, and update unused tokens after you get the results.
* For the rare case of new model families with tied embeddings:
* Take a guess, like `[0]`, or use the tokenizer vocabulary to pick some.
-* Run the `fishing.py` script and kill it when it starts verifying.
-* You now have `verifications/yourmodel.jsonl` which allows you to look at the vocabulary and update suitable tokens.
+* Run the `magikarp/fishing.py` script and kill it when it starts verifying.
+* You now have `results/verifications/yourmodel.jsonl` which allows you to look at the vocabulary and update suitable tokens.
* Update your unused tokens, and run verification.
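
Concretely, the registry entry you end up hard-coding in `magikarp/unused_tokens.py` follows the same pattern as the DeepSeek entries added further down in this commit. A minimal sketch, with a made-up model id and token range (pick ids your model genuinely never needs to produce, e.g. reserved or placeholder tokens you spotted in `results/verifications/yourmodel.jsonl`):

```python
import numpy as np

# Minimal sketch -- the model id and id range below are placeholders, not from the repo.
MYMODEL_UNUSED_TOKENS = np.arange(5, 20)  # hypothetical block of reserved/placeholder tokens

UNUSED_TOKENS = {
    # ... existing entries ...
    "my-org/my-model-7b-base": MYMODEL_UNUSED_TOKENS,
}
```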

### Generating results
18 changes: 9 additions & 9 deletions magikarp/report.py
@@ -258,15 +258,21 @@ def make_tokens_report(model_id, toka, moda, token_infos, metric_ix, save_hires=
ti for ti in token_infos.values() if "UNDECODEABLE" in ti["category"] and ti["i"] not in exclude
]

+# plot verifications first, so we can look at them even if we don't have a threshold
+_, verification_filename = verification_plot(
+    model_id, verification_candidates, metric_names[metric_ix], save_hires=save_hires
+)
+_, metrics_filename = metrics_pairplot(token_infos, toka, moda, color_by_id=True, save_hires=save_hires)

# find threshold for table collapse
p_verify_threshold = 0.01
window = 12
frac_verified_thr = 2.0 / 3
verifications_below_threshold = np.array([c["max_prob"] < p_verify_threshold for c in candidates_without_excl])

first_below_thr = window
-while first_below_thr + window < len(
-    candidates_without_excl
-): # find threshold where verification rate drops below 2/3
+# find threshold where verification rate drops below 2/3
+while first_below_thr + window < len(candidates_without_excl):
if (
verifications_below_threshold[first_below_thr - window : first_below_thr + window + 1].mean()
< frac_verified_thr
@@ -316,12 +322,6 @@ def make_tokens_report(model_id, toka, moda, token_infos, metric_ix, save_hires=
find_superstrings_in=token_infos.values(),
)

-# plot verifications
-_, verification_filename = verification_plot(
-    model_id, verification_candidates, metric_names[metric_ix], save_hires=save_hires
-)
-_, metrics_filename = metrics_pairplot(token_infos, toka, moda, color_by_id=True, save_hires=save_hires)

# make a giant markdown file and write it

# summary
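
As a reading aid for the threshold code in the first `report.py` hunk above: the point where the report table gets collapsed is found by a sliding-window scan over the verification results. Below is a self-contained sketch of that search, an illustrative re-implementation rather than the repository's code; the loop body hidden behind the fold is assumed to break at the first window where the verified fraction drops below 2/3.

```python
import numpy as np

def find_collapse_index(max_probs, p_verify_threshold=0.01, window=12, frac_verified_thr=2.0 / 3):
    """Scan candidates (already sorted by the detection metric) and return the first index
    whose +/- `window` neighbourhood has fewer than ~2/3 of tokens verifying as
    under-trained (i.e. max_prob below `p_verify_threshold`)."""
    below = np.asarray(max_probs) < p_verify_threshold
    first_below_thr = window
    while first_below_thr + window < len(max_probs):
        if below[first_below_thr - window : first_below_thr + window + 1].mean() < frac_verified_thr:
            break  # assumed loop body: stop at the first window failing the 2/3 criterion
        first_below_thr += 1
    return first_below_thr
```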
5 changes: 5 additions & 0 deletions magikarp/unused_tokens.py
@@ -16,6 +16,9 @@
YI_UNUSED_TOKENS = np.arange(145, 305) # <|unused_token
JAMBA_UNUSED_TOKENS = np.arange(4, 515) # <|maskxxx|>

+DEEPSEEK_LANG_UNUSED_TOKENS = np.arange(100002, 100015) # unused utf8
+DEEPSEEK_CODE_UNUSED_TOKENS = np.arange(171, 173) # f1/f2

# Defines reference unused tokens for models
# optional for most models, but also functions as a kind of registry of models to process
UNUSED_TOKENS = {
@@ -66,4 +69,6 @@
"bigcode/starcoder2-15b": STARCODER2_UNUSED_TOKENS,
"01-ai/Yi-9B": YI_UNUSED_TOKENS,
"ai21labs/Jamba-v0.1": JAMBA_UNUSED_TOKENS,
"deepseek-ai/deepseek-llm-7b-base": DEEPSEEK_LANG_UNUSED_TOKENS,
"deepseek-ai/deepseek-coder-33b-base": DEEPSEEK_CODE_UNUSED_TOKENS,
}
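
One detail worth noting when copying the pattern of the entries added above: `np.arange` is half-open, so the stop index itself is not included. A quick check (nothing repo-specific here, just numpy semantics):

```python
import numpy as np

# arange(171, 173) covers exactly token ids 171 and 172 (the "f1/f2" tokens),
# and arange(100002, 100015) covers ids 100002..100014 (the unused utf8 tokens).
assert np.arange(171, 173).tolist() == [171, 172]
assert np.arange(100002, 100015).tolist() == list(range(100002, 100015))
```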