Update notebook examples #155

Open
wants to merge 8 commits into base: main
7 changes: 7 additions & 0 deletions .gitignore
@@ -1,3 +1,10 @@
# temp files of examples
model-cache/
notebooks/dataset/
notebooks/toy_data/nv_embedding/
notebooks/toy_data/jordan.png
notebooks/embed

# Python Exclusions
.venv
**__pycache__**
27 changes: 17 additions & 10 deletions notebooks/01_dataloader.ipynb
@@ -7,17 +7,22 @@
"source": [
"# Press Release Chat Bot\n",
"\n",
"As part of this generative AI workflow, we create a NVIDIA PR chatbot that answers questions from the NVIDIA news and blogs from years of 2022 and 2023. For this, we have created a REST FastAPI server that wraps llama-index. The API server has two methods, ```upload_document``` and ```generate```. The ```upload_document``` method takes a document from the user's computer and uploads it to a Milvus vector database after splitting, chunking and embedding the document. The ```generate``` API method generates an answer from the provided prompt optionally sourcing information from a vector database. "
"As part of this generative AI workflow, we create a NVIDIA PR chatbot that answers questions from the NVIDIA news and blogs from years of 2022 and 2023. For this, we have created a REST FastAPI server that wraps llama-index. The API server has two methods, ```upload_document``` and ```generate```. The ```upload_document``` method takes a document from the user's computer and uploads it to a Milvus vector database after splitting, chunking and embedding the document. The ```generate``` API method generates an answer from the provided prompt optionally sourcing information from a vector database.\n",
"\n",
"\n",
"**Pre-requisites:** follow the [Using NVIDIA NIM for LLMs](https://nvidia.github.io/GenerativeAIExamples/latest/nim-llms.html) tutorial, up to the [Build and Start the Containers](https://nvidia.github.io/GenerativeAIExamples/latest/nim-llms.html#build-and-start-the-containers) section. This tutorial requires \\~39 GB disk space for container images (\\~29 GB) and model checkpoints (\\~9GB).\n",
"\n",
"> To run this example on a machine _without_ any GPU, you need to obtain a key for the NVIDIA API Catalog ([steps](https://nvidia.github.io/GenerativeAIExamples/latest/api-catalog.html#get-an-api-key-for-the-accessing-models-on-the-api-catalog)), then modify step 5A to `export NVIDIA_API_KEY=nvapi-xxx`. This will make step 5B to run containers that will invoke NVIDIA AI APIs."
]
},
{
"cell_type": "markdown",
"id": "4c74eaf2",
"metadata": {},
"source": [
"#### Step-1: Load the pdf files from the dataset folder.\n",
"## Step-1: Load the pdf files from the dataset folder.\n",
"\n",
"You can upload the pdf files containing the NVIDIA blogs to ```query:8081/uploadDocument``` API endpoint"
"You can upload the pdf files containing the NVIDIA blogs to ```http://chain_server:8081/documents``` API endpoint."
]
},
{
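For context, a minimal sketch of the upload step this notebook performs: one PDF is posted to the chain server's `/documents` endpoint. The full `upload_document` helper is collapsed in this diff, so the multipart field name and response handling below are assumptions for illustration.

```python
import mimetypes
import requests

def upload_document(file_path: str, url: str) -> str:
    """Post a single file to the chain server's /documents endpoint.

    The multipart field name ("file") is an assumption; check the chain
    server's FastAPI docs at http://<chain_server>:8081/docs if it differs.
    """
    mime_type, _ = mimetypes.guess_type(file_path)
    with open(file_path, "rb") as f:
        files = {"file": (file_path, f, mime_type or "application/octet-stream")}
        response = requests.post(url, headers={"accept": "application/json"}, files=files)
    response.raise_for_status()
    return response.text

# Illustrative usage against a co-located chain server; the file name is hypothetical.
print(upload_document("dataset/example_blog.pdf", "http://localhost:8081/documents"))
```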
@@ -38,6 +43,8 @@
"metadata": {},
"outputs": [],
"source": [
"chain_server = 'localhost' # Assume the API server is co-located with the JupyterLab server.\n",
"\n",
"import os\n",
"import requests\n",
"import mimetypes\n",
@@ -63,7 +70,7 @@
" file_path = os.path.join(folder_path, files)\n",
" print(upload_document(file_path, upload_url))\n",
" i += 1\n",
" if i > num_files:\n",
" if i >= num_files:\n",
" break"
]
},
@@ -78,7 +85,7 @@
"\n",
"start_time = time.time()\n",
"NUM_DOCS_TO_UPLOAD=100\n",
"upload_pdf_files(\"dataset\", \"http://chain-server:8081/documents\", NUM_DOCS_TO_UPLOAD)\n",
"upload_pdf_files(\"dataset\", f\"http://{chain_server}:8081/documents\", NUM_DOCS_TO_UPLOAD)\n",
"print(f\"--- {time.time() - start_time} seconds ---\")"
]
},
@@ -87,8 +94,8 @@
"id": "830882ef",
"metadata": {},
"source": [
"#### Step-2 : Ask a question without referring to the knowledge base\n",
"Ask Tensorrt LLM llama-2 13B model a question about \"the nvidia grace superchip\" without seeking help from the vectordb/knowledge base by setting ```use_knowledge_base``` to ```false```"
"## Step-2 : Ask a question without referring to the knowledge base\n",
"Ask TensorRT-LLM llama-2 13B model a question about \"the nvidia grace superchip\" without seeking help from the vectordb/knowledge base by setting ```use_knowledge_base``` to ```false```"
]
},
{
@@ -112,7 +119,7 @@
" \"max_tokens\": 256\n",
"}\n",
"\n",
"url = \"http://chain-server:8081/generate\"\n",
"url = f\"http://{chain_server}:8081/generate\"\n",
"\n",
"start_time = time.time()\n",
"with requests.post(url, stream=True, json=data) as req:\n",
@@ -155,7 +162,7 @@
" \"max_tokens\": 50\n",
"}\n",
"\n",
"url = \"http://chain-server:8081/generate\"\n",
"url = f\"http://{chain_server}:8081/generate\"\n",
"\n",
"start_time = time.time()\n",
"tokens_generated = 0\n",
@@ -180,7 +187,7 @@
"id": "58954d15",
"metadata": {},
"source": [
"#### Next steps\n",
"## Next steps\n",
"\n",
"We have setup a playground UI for you to upload files and get answers from, the UI is available on the same IP address as the notebooks: `host_ip:8090/converse`"
]
25 changes: 18 additions & 7 deletions notebooks/02_Option(1)_NVIDIA_AI_endpoint_simple.ipynb
@@ -79,7 +79,7 @@
"# test run and see that you can genreate a respond successfully\n",
"from langchain_nvidia_ai_endpoints import ChatNVIDIA\n",
"\n",
"llm = ChatNVIDIA(model=\"ai-mixtral-8x7b-instruct\", nvidia_api_key=nvapi_key, max_tokens=1024)\n",
"llm = ChatNVIDIA(model=\"mistralai/mixtral-8x7b-instruct-v0.1\", nvidia_api_key=nvapi_key, max_tokens=1024)\n",
"\n",
"result = llm.invoke(\"Write a ballad about LangChain.\")\n",
"print(result.content)"
@@ -111,10 +111,10 @@
"source": [
"from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings\n",
"\n",
"embedder = NVIDIAEmbeddings(model=\"ai-embed-qa-4\")\n",
"embedder = NVIDIAEmbeddings(model=\"NV-Embed-QA\")\n",
"\n",
"# Alternatively, if you want to specify whether it will use the query or passage type\n",
"# embedder = NVIDIAEmbeddings(model=\"ai-embed-qa-4\", model_type=\"passage\")"
"# embedder = NVIDIAEmbeddings(model=\"NV-Embed-QA\", model_type=\"passage\")"
]
},
{
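As a quick sanity check of the renamed embedding model, the embedder exposes the standard LangChain embeddings interface; a short probe might look like the sketch below (the reported dimension of 1024 for NV-Embed-QA is an assumption).

```python
# Embed one query and two passages with the NV-Embed-QA model; this assumes
# NVIDIA_API_KEY is already set, as earlier in the notebook.
query_vec = embedder.embed_query("What is the NVIDIA Grace superchip?")
doc_vecs = embedder.embed_documents([
    "NVIDIA Grace is a data center CPU designed for AI and HPC workloads.",
    "The Grace Hopper superchip pairs a Grace CPU with a Hopper GPU.",
])
print(len(query_vec), len(doc_vecs), len(doc_vecs[0]))  # e.g. 1024 2 1024 (dimension assumed)
```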
@@ -248,7 +248,20 @@
"id": "1421512a",
"metadata": {},
"source": [
"### Step 6c - Read the previously processed & saved Faiss vectore store back"
"### Step 6c - Read the previously processed & saved Faiss vectore store back\n",
"\n",
"<details>\n",
"<summary>Notes on <code>allow_dangerous_serialization</code> <code>ValueError</code></summary>\n",
"\n",
"<p><code>langchain>=0.1.17</code> requires kwarg <code>allow_dangerous_deserialization=True</code>\n",
"added to <code>FAISS.load_local()</code> to avoid this error: <i>\"The de-serialization relies loading a pickle file.\n",
"Pickle files can be modified to deliver a malicious payload that results in execution of arbitrary code on your machine.\n",
"You will need to set `allow_dangerous_deserialization` to `True` to enable deserialization. If you do this, make sure that you\n",
"trust the source of the data. For example, if you are loading a file that you created, and know that no one else has modified the\n",
"file, then this is safe to do. Do not set this to `True` if you are loading a file from an untrusted source (e.g., some random site on \"\n",
"the internet.)\"</i> (<a href=\"https://github.com/langchain-ai/langchain/blob/315223ce264d4932f44ca12736619c89340beabe/libs/community/langchain_community/vectorstores/faiss.py#L1076\">ref</a>) .\n",
"</p>\n",
"</details>"
]
},
{
@@ -259,8 +272,7 @@
"outputs": [],
"source": [
"# Load the vectorestore back.\n",
"\n",
"store = FAISS.load_local(\"./toy_data/nv_embedding\", embedder)\n"
"store = FAISS.load_local(\"./toy_data/nv_embedding\", embedder, allow_dangerous_deserialization=True)"
]
},
{
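To make the `allow_dangerous_deserialization` change above concrete, here is a minimal save/load round trip. The `FAISS` import path, the toy text, and the similarity query are illustrative; the notebook builds its store from `./toy_data` instead.

```python
from langchain_community.vectorstores import FAISS  # import path may differ by LangChain version
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings

embedder = NVIDIAEmbeddings(model="NV-Embed-QA")

# Build and persist a tiny store, then re-load it. load_local() unpickles the
# docstore, so opting in is only appropriate for files you created yourself.
store = FAISS.from_texts(["NVIDIA Grace is a data center CPU."], embedder)
store.save_local("./toy_data/nv_embedding")

store = FAISS.load_local(
    "./toy_data/nv_embedding",
    embedder,
    allow_dangerous_deserialization=True,
)
print(store.similarity_search("Grace superchip", k=1))
```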
@@ -278,7 +290,6 @@
"metadata": {},
"outputs": [],
"source": [
"\n",
"retriever = store.as_retriever()\n",
"\n",
"prompt = ChatPromptTemplate.from_messages(\n",
18 changes: 10 additions & 8 deletions notebooks/03_Option(1)_llama_index_with_NVIDIA_AI_endpoint.ipynb
Expand Up @@ -79,7 +79,7 @@
"source": [
"# test run and see that you can genreate a respond successfully\n",
"from langchain_nvidia_ai_endpoints import ChatNVIDIA\n",
"llm = ChatNVIDIA(model=\"ai-mixtral-8x7b-instruct\", nvidia_api_key=nvapi_key, max_tokens=1024)\n",
"llm = ChatNVIDIA(model=\"mistralai/mixtral-8x7b-instruct-v0.1\", nvidia_api_key=nvapi_key, max_tokens=1024)\n",
"result = llm.invoke(\"Write a ballad about LangChain.\")\n",
"print(result.content)"
]
@@ -101,13 +101,14 @@
"source": [
"# Create and dl embeddings instance wrapping huggingface embedding into langchain embedding\n",
"# Bring in embeddings wrapper\n",
"from llama_index.embeddings import LangchainEmbedding\n",
"from llama_index.legacy.embeddings import LangchainEmbedding\n",
"\n",
"from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings\n",
"nv_embedding = NVIDIAEmbeddings(model=\"ai-embed-qa-4\")\n",
"\n",
"nv_embedding = NVIDIAEmbeddings(model=\"NV-Embed-QA\")\n",
"li_embedding=LangchainEmbedding(nv_embedding)\n",
"# Alternatively, if you want to specify whether it will use the query or passage type\n",
"# embedder = NVIDIAEmbeddings(model=\"ai-embed-qa-4\", model_type=\"passage\")\n"
"# embedder = NVIDIAEmbeddings(model=\"NV-Embed-QA\", model_type=\"passage\")"
]
},
{
@@ -144,7 +145,7 @@
" embed_model=li_embedding\n",
")\n",
"# And set the service context\n",
"set_global_service_context(service_context)\n"
"set_global_service_context(service_context)"
]
},
{
@@ -159,15 +160,16 @@
"cell_type": "code",
"execution_count": null,
"id": "47a17ce2",
"metadata": {},
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"#create query engine with cross encoder reranker\n",
"from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext\n",
"import torch\n",
"\n",
"documents = SimpleDirectoryReader(\"./toy_data\").load_data()\n",
"index = VectorStoreIndex.from_documents(documents, service_context=service_context)\n"
"index = VectorStoreIndex.from_documents(documents, service_context=service_context)"
]
},
{
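Once the index above is built, querying it with the (legacy) llama-index API typically looks like the sketch below; the question text and `similarity_top_k` value are illustrative.

```python
# Query the VectorStoreIndex built from ./toy_data; the global service context
# (ChatNVIDIA LLM + NV-Embed-QA embeddings) set earlier is assumed to be active.
query_engine = index.as_query_engine(similarity_top_k=4)
response = query_engine.query("Summarize what the toy_data documents say about NVIDIA.")
print(response.response)

# Inspect which chunks were retrieved to ground the answer.
for source in response.source_nodes:
    print(round(source.score or 0.0, 3), source.node.get_content()[:80])
```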
22 changes: 9 additions & 13 deletions notebooks/04_Agent_use_tools_leveraging_NVIDIA_AI_endpoints.ipynb
@@ -34,13 +34,13 @@
"At the end of the day, as below illustrated, we would like to have a UI which allow user to upload image of their choice and have the agent choose tools to do visual reasoning. \n",
"\n",
"![interactive UI](./imgs/visual_reasoning.png) \n",
"Note: As one can see, since we are using NVIDIA AI Catalog as an API, there is no further requirement in the prerequisites about GPUs as compute hardware\n"
"Note: As one can see, since we are using NVIDIA AI Catalog as an API, there is no further requirement in the prerequisites about GPUs as compute hardware"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "86843453",
"id": "2699edba-8f2a-4473-8cfb-b8667051a619",
"metadata": {},
"outputs": [],
"source": [
@@ -72,11 +72,11 @@
"# del os.environ['NVIDIA_API_KEY'] ## delete key and reset\n",
"if os.environ.get(\"NVIDIA_API_KEY\", \"\").startswith(\"nvapi-\"):\n",
" print(\"Valid NVIDIA_API_KEY already in environment. Delete to reset\")\n",
" nvapi_key = os.environ[\"NVIDIA_API_KEY\"]\n",
"else:\n",
" nvapi_key = getpass.getpass(\"NVAPI Key (starts with nvapi-): \")\n",
" assert nvapi_key.startswith(\"nvapi-\"), f\"{nvapi_key[:5]}... is not a valid key\"\n",
" os.environ[\"NVIDIA_API_KEY\"] = nvapi_key\n",
"global nvapi_key"
" os.environ[\"NVIDIA_API_KEY\"] = nvapi_key"
]
},
{
@@ -125,22 +125,18 @@
"\n",
"\n",
"\n",
"def fuyu(prompt,img_path):\n",
"def fuyu(prompt, img_path, nvapi_key=nvapi_key):\n",
" invoke_url = \"https://ai.api.nvidia.com/v1/vlm/adept/fuyu-8b\"\n",
" stream = True\n",
" \n",
" \n",
" image_b64=img2base64_string(img_path)\n",
" \n",
" \n",
" assert len(image_b64) < 200_000, \\\n",
" \"To upload larger images, use the assets API (see docs)\"\n",
"\n",
" headers = {\n",
" \"Authorization\": f\"Bearer {nvapi_key}\",\n",
" \"Accept\": \"text/event-stream\" if stream else \"application/json\"\n",
" }\n",
" \n",
"\n",
" payload = {\n",
" \"messages\": [\n",
" {\n",
Expand All @@ -154,9 +150,9 @@
" \"seed\": 0,\n",
" \"stream\": stream\n",
" }\n",
" \n",
"\n",
" response = requests.post(invoke_url, headers=headers, json=payload)\n",
" \n",
"\n",
" if stream:\n",
" output=[]\n",
" for line in response.iter_lines():\n",
@@ -216,7 +212,7 @@
"source": [
"# test run and see that you can genreate a respond successfully\n",
"from langchain_nvidia_ai_endpoints import ChatNVIDIA\n",
"llm = ChatNVIDIA(model=\"ai-mixtral-8x7b-instruct\", nvidia_api_key=nvapi_key, max_tokens=1024)\n"
"llm = ChatNVIDIA(model=\"mistralai/mixtral-8x7b-instruct-v0.1\", nvidia_api_key=nvapi_key, max_tokens=1024)"
]
},
{
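The `fuyu` helper above relies on an `img2base64_string` utility whose cell is collapsed in this diff; a plausible minimal version, assumed from how it is used, is:

```python
import base64

def img2base64_string(img_path):
    # Sketch assumed from its usage in fuyu(); the real notebook cell may also
    # resize or re-encode the image to stay under the payload size limit.
    with open(img_path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")
```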