From 6b37a557f847fd0a785b12b6901239b01bd2b49f Mon Sep 17 00:00:00 2001 From: Verdi March Date: Mon, 29 Jul 2024 16:00:55 +0800 Subject: [PATCH 1/8] notebooks/01: parameterize chain-server url --- .gitignore | 4 ++++ notebooks/01_dataloader.ipynb | 27 +++++++++++++++++---------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index 7094b42f..38b407a5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ +# temp files of examples +model-cache/ +notebooks/dataset/ + # Python Exclusions .venv **__pycache__** diff --git a/notebooks/01_dataloader.ipynb b/notebooks/01_dataloader.ipynb index 4fe75990..862b6751 100644 --- a/notebooks/01_dataloader.ipynb +++ b/notebooks/01_dataloader.ipynb @@ -7,7 +7,12 @@ "source": [ "# Press Release Chat Bot\n", "\n", - "As part of this generative AI workflow, we create a NVIDIA PR chatbot that answers questions from the NVIDIA news and blogs from years of 2022 and 2023. For this, we have created a REST FastAPI server that wraps llama-index. The API server has two methods, ```upload_document``` and ```generate```. The ```upload_document``` method takes a document from the user's computer and uploads it to a Milvus vector database after splitting, chunking and embedding the document. The ```generate``` API method generates an answer from the provided prompt optionally sourcing information from a vector database. " + "As part of this generative AI workflow, we create a NVIDIA PR chatbot that answers questions from the NVIDIA news and blogs from years of 2022 and 2023. For this, we have created a REST FastAPI server that wraps llama-index. The API server has two methods, ```upload_document``` and ```generate```. The ```upload_document``` method takes a document from the user's computer and uploads it to a Milvus vector database after splitting, chunking and embedding the document. The ```generate``` API method generates an answer from the provided prompt optionally sourcing information from a vector database.\n", + "\n", + "\n", + "**Pre-requisites:** follow the [Using NVIDIA NIM for LLMs](https://nvidia.github.io/GenerativeAIExamples/latest/nim-llms.html) tutorial, up to the [Build and Start the Containers](https://nvidia.github.io/GenerativeAIExamples/latest/nim-llms.html#build-and-start-the-containers) section. This tutorial requires \\~39 GB disk space for container images (\\~29 GB) and model checkpoints (\\~9GB).\n", + "\n", + "> To run this example on a machine _without_ any GPU, you need to obtain a key for the NVIDIA API Catalog ([steps](https://nvidia.github.io/GenerativeAIExamples/latest/api-catalog.html#get-an-api-key-for-the-accessing-models-on-the-api-catalog)), then modify step 5A to `export NVIDIA_API_KEY=nvapi-xxx`. This will make step 5B to run containers that will invoke NVIDIA AI APIs." ] }, { @@ -15,9 +20,9 @@ "id": "4c74eaf2", "metadata": {}, "source": [ - "#### Step-1: Load the pdf files from the dataset folder.\n", + "## Step-1: Load the pdf files from the dataset folder.\n", "\n", - "You can upload the pdf files containing the NVIDIA blogs to ```query:8081/uploadDocument``` API endpoint" + "You can upload the pdf files containing the NVIDIA blogs to ```http://chain_server:8081/documents``` API endpoint." 
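For orientation: the code cells that follow wrap this call in `upload_document()` / `upload_pdf_files()`. As a stand-alone illustration, a single upload could look roughly like the sketch below. The host name and the `file` form-field name are assumptions here, not taken from the chain server's API definition — adjust them to match your deployment.

```python
# Illustrative sketch only -- the host and the "file" form-field name are assumptions.
import mimetypes
import requests

def upload_one_pdf(file_path: str, url: str = "http://localhost:8081/documents") -> str:
    # Guess the MIME type so the server knows it is receiving a PDF.
    mime_type, _ = mimetypes.guess_type(file_path)
    with open(file_path, "rb") as f:
        files = {"file": (file_path, f, mime_type or "application/octet-stream")}
        response = requests.post(url, files=files, timeout=600)
    response.raise_for_status()  # surface HTTP errors instead of silently continuing
    return response.text

# Example: upload_one_pdf("dataset/some-press-release.pdf")
```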
] }, { @@ -38,6 +43,8 @@ "metadata": {}, "outputs": [], "source": [ + "chain_server = 'localhost' # Assume the API server is co-located with the JupyterLab server.\n", + "\n", "import os\n", "import requests\n", "import mimetypes\n", @@ -63,7 +70,7 @@ " file_path = os.path.join(folder_path, files)\n", " print(upload_document(file_path, upload_url))\n", " i += 1\n", - " if i > num_files:\n", + " if i >= num_files:\n", " break" ] }, @@ -78,7 +85,7 @@ "\n", "start_time = time.time()\n", "NUM_DOCS_TO_UPLOAD=100\n", - "upload_pdf_files(\"dataset\", \"http://chain-server:8081/documents\", NUM_DOCS_TO_UPLOAD)\n", + "upload_pdf_files(\"dataset\", f\"http://{chain_server}:8081/documents\", NUM_DOCS_TO_UPLOAD)\n", "print(f\"--- {time.time() - start_time} seconds ---\")" ] }, @@ -87,8 +94,8 @@ "id": "830882ef", "metadata": {}, "source": [ - "#### Step-2 : Ask a question without referring to the knowledge base\n", - "Ask Tensorrt LLM llama-2 13B model a question about \"the nvidia grace superchip\" without seeking help from the vectordb/knowledge base by setting ```use_knowledge_base``` to ```false```" + "## Step-2 : Ask a question without referring to the knowledge base\n", + "Ask TensorRT-LLM llama-2 13B model a question about \"the nvidia grace superchip\" without seeking help from the vectordb/knowledge base by setting ```use_knowledge_base``` to ```false```" ] }, { @@ -112,7 +119,7 @@ " \"max_tokens\": 256\n", "}\n", "\n", - "url = \"http://chain-server:8081/generate\"\n", + "url = f\"http://{chain_server}:8081/generate\"\n", "\n", "start_time = time.time()\n", "with requests.post(url, stream=True, json=data) as req:\n", @@ -155,7 +162,7 @@ " \"max_tokens\": 50\n", "}\n", "\n", - "url = \"http://chain-server:8081/generate\"\n", + "url = f\"http://{chain_server}:8081/generate\"\n", "\n", "start_time = time.time()\n", "tokens_generated = 0\n", @@ -180,7 +187,7 @@ "id": "58954d15", "metadata": {}, "source": [ - "#### Next steps\n", + "## Next steps\n", "\n", "We have setup a playground UI for you to upload files and get answers from, the UI is available on the same IP address as the notebooks: `host_ip:8090/converse`" ] From 0b58c02322344edcd325242b4c9fdbf71df4a76d Mon Sep 17 00:00:00 2001 From: Verdi March Date: Mon, 29 Jul 2024 17:32:17 +0800 Subject: [PATCH 2/8] Fix notebook 02 - update deprecated model names - fix deserialization error with newer langchain --- .gitignore | 1 + ..._Option(1)_NVIDIA_AI_endpoint_simple.ipynb | 25 +++++++++++++------ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 38b407a5..7e71fc6e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ # temp files of examples model-cache/ notebooks/dataset/ +notebooks/toy_data/nv_embedding/ # Python Exclusions .venv diff --git a/notebooks/02_Option(1)_NVIDIA_AI_endpoint_simple.ipynb b/notebooks/02_Option(1)_NVIDIA_AI_endpoint_simple.ipynb index fdc1fb56..36f1db3c 100644 --- a/notebooks/02_Option(1)_NVIDIA_AI_endpoint_simple.ipynb +++ b/notebooks/02_Option(1)_NVIDIA_AI_endpoint_simple.ipynb @@ -79,7 +79,7 @@ "# test run and see that you can genreate a respond successfully\n", "from langchain_nvidia_ai_endpoints import ChatNVIDIA\n", "\n", - "llm = ChatNVIDIA(model=\"ai-mixtral-8x7b-instruct\", nvidia_api_key=nvapi_key, max_tokens=1024)\n", + "llm = ChatNVIDIA(model=\"mistralai/mixtral-8x7b-instruct-v0.1\", nvidia_api_key=nvapi_key, max_tokens=1024)\n", "\n", "result = llm.invoke(\"Write a ballad about LangChain.\")\n", "print(result.content)" @@ -111,10 +111,10 @@ "source": [ "from 
langchain_nvidia_ai_endpoints import NVIDIAEmbeddings\n", "\n", - "embedder = NVIDIAEmbeddings(model=\"ai-embed-qa-4\")\n", + "embedder = NVIDIAEmbeddings(model=\"NV-Embed-QA\")\n", "\n", "# Alternatively, if you want to specify whether it will use the query or passage type\n", - "# embedder = NVIDIAEmbeddings(model=\"ai-embed-qa-4\", model_type=\"passage\")" + "# embedder = NVIDIAEmbeddings(model=\"NV-Embed-QA\", model_type=\"passage\")" ] }, { @@ -248,7 +248,20 @@ "id": "1421512a", "metadata": {}, "source": [ - "### Step 6c - Read the previously processed & saved Faiss vectore store back" + "### Step 6c - Read the previously processed & saved Faiss vectore store back\n", + "\n", + "
\n", + "Notes on allow_dangerous_serialization ValueError\n", + "\n", + "

langchain>=0.1.17 requires kwarg allow_dangerous_deserialization=True\n", + "added to FAISS.load_local() to avoid this error: \"The de-serialization relies loading a pickle file.\n", + "Pickle files can be modified to deliver a malicious payload that results in execution of arbitrary code on your machine.\n", + "You will need to set `allow_dangerous_deserialization` to `True` to enable deserialization. If you do this, make sure that you\n", + "trust the source of the data. For example, if you are loading a file that you created, and know that no one else has modified the\n", + "file, then this is safe to do. Do not set this to `True` if you are loading a file from an untrusted source (e.g., some random site on \"\n", + "the internet.)\" (ref) .\n", + "

\n", + "
" ] }, { @@ -259,8 +272,7 @@ "outputs": [], "source": [ "# Load the vectorestore back.\n", - "\n", - "store = FAISS.load_local(\"./toy_data/nv_embedding\", embedder)\n" + "store = FAISS.load_local(\"./toy_data/nv_embedding\", embedder, allow_dangerous_deserialization=True)" ] }, { @@ -278,7 +290,6 @@ "metadata": {}, "outputs": [], "source": [ - "\n", "retriever = store.as_retriever()\n", "\n", "prompt = ChatPromptTemplate.from_messages(\n", From fd549df0fa8e5d679c1a3fad5759e5bd1b1aabf0 Mon Sep 17 00:00:00 2001 From: Verdi March Date: Mon, 29 Jul 2024 18:26:51 +0800 Subject: [PATCH 3/8] Update notebook 03 - fix deprecated model name - remove unused import torch - relax patch-version of llama-index --- ...)_llama_index_with_NVIDIA_AI_endpoint.ipynb | 18 ++++++++++-------- notebooks/requirements.txt | 2 +- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/notebooks/03_Option(1)_llama_index_with_NVIDIA_AI_endpoint.ipynb b/notebooks/03_Option(1)_llama_index_with_NVIDIA_AI_endpoint.ipynb index 56769a71..a940bba6 100644 --- a/notebooks/03_Option(1)_llama_index_with_NVIDIA_AI_endpoint.ipynb +++ b/notebooks/03_Option(1)_llama_index_with_NVIDIA_AI_endpoint.ipynb @@ -79,7 +79,7 @@ "source": [ "# test run and see that you can genreate a respond successfully\n", "from langchain_nvidia_ai_endpoints import ChatNVIDIA\n", - "llm = ChatNVIDIA(model=\"ai-mixtral-8x7b-instruct\", nvidia_api_key=nvapi_key, max_tokens=1024)\n", + "llm = ChatNVIDIA(model=\"mistralai/mixtral-8x7b-instruct-v0.1\", nvidia_api_key=nvapi_key, max_tokens=1024)\n", "result = llm.invoke(\"Write a ballad about LangChain.\")\n", "print(result.content)" ] @@ -101,13 +101,14 @@ "source": [ "# Create and dl embeddings instance wrapping huggingface embedding into langchain embedding\n", "# Bring in embeddings wrapper\n", - "from llama_index.embeddings import LangchainEmbedding\n", + "from llama_index.legacy.embeddings import LangchainEmbedding\n", "\n", "from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings\n", - "nv_embedding = NVIDIAEmbeddings(model=\"ai-embed-qa-4\")\n", + "\n", + "nv_embedding = NVIDIAEmbeddings(model=\"NV-Embed-QA\")\n", "li_embedding=LangchainEmbedding(nv_embedding)\n", "# Alternatively, if you want to specify whether it will use the query or passage type\n", - "# embedder = NVIDIAEmbeddings(model=\"ai-embed-qa-4\", model_type=\"passage\")\n" + "# embedder = NVIDIAEmbeddings(model=\"NV-Embed-QA\", model_type=\"passage\")" ] }, { @@ -144,7 +145,7 @@ " embed_model=li_embedding\n", ")\n", "# And set the service context\n", - "set_global_service_context(service_context)\n" + "set_global_service_context(service_context)" ] }, { @@ -159,15 +160,16 @@ "cell_type": "code", "execution_count": null, "id": "47a17ce2", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "#create query engine with cross encoder reranker\n", "from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext\n", - "import torch\n", "\n", "documents = SimpleDirectoryReader(\"./toy_data\").load_data()\n", - "index = VectorStoreIndex.from_documents(documents, service_context=service_context)\n" + "index = VectorStoreIndex.from_documents(documents, service_context=service_context)" ] }, { diff --git a/notebooks/requirements.txt b/notebooks/requirements.txt index c8002465..ccbc1fb6 100644 --- a/notebooks/requirements.txt +++ b/notebooks/requirements.txt @@ -4,7 +4,7 @@ python-multipart==0.0.6 langchain==0.1.9 unstructured[all-docs]==0.11.2 sentence-transformers==2.2.2 -llama-index==0.9.22 
+llama-index<0.10.0 dataclass-wizard==0.22.2 opencv-python==4.8.0.74 llama-hub==0.0.43 From 641b1020ed13db3ac182013766002ae133f52c65 Mon Sep 17 00:00:00 2001 From: Verdi March Date: Mon, 29 Jul 2024 19:00:46 +0800 Subject: [PATCH 4/8] Fix notebook 04 - fix deprecated model names - fix nvapi_key not loaded from env var - fix fuyu() does not include nvapi_key in request --- .gitignore | 1 + ...tools_leveraging_NVIDIA_AI_endpoints.ipynb | 26 +++++++++---------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/.gitignore b/.gitignore index 7e71fc6e..d3d9b54d 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ model-cache/ notebooks/dataset/ notebooks/toy_data/nv_embedding/ +notebooks/toy_data/jordan.png # Python Exclusions .venv diff --git a/notebooks/04_Agent_use_tools_leveraging_NVIDIA_AI_endpoints.ipynb b/notebooks/04_Agent_use_tools_leveraging_NVIDIA_AI_endpoints.ipynb index b38f28dd..14b10014 100644 --- a/notebooks/04_Agent_use_tools_leveraging_NVIDIA_AI_endpoints.ipynb +++ b/notebooks/04_Agent_use_tools_leveraging_NVIDIA_AI_endpoints.ipynb @@ -34,20 +34,22 @@ "At the end of the day, as below illustrated, we would like to have a UI which allow user to upload image of their choice and have the agent choose tools to do visual reasoning. \n", "\n", "![interactive UI](./imgs/visual_reasoning.png) \n", - "Note: As one can see, since we are using NVIDIA AI Catalog as an API, there is no further requirement in the prerequisites about GPUs as compute hardware\n" + "Note: As one can see, since we are using NVIDIA AI Catalog as an API, there is no further requirement in the prerequisites about GPUs as compute hardware" ] }, { "cell_type": "code", "execution_count": null, - "id": "86843453", + "id": "2699edba-8f2a-4473-8cfb-b8667051a619", "metadata": {}, "outputs": [], "source": [ + "'''\n", "# uncomment the below to install additional python packages.\n", "#!pip install unstructured\n", "#!pip install matplotlib scikit-image\n", - "!pip install gradio==3.48.0" + "!pip install gradio==3.48.0\n", + "''';" ] }, { @@ -72,11 +74,11 @@ "# del os.environ['NVIDIA_API_KEY'] ## delete key and reset\n", "if os.environ.get(\"NVIDIA_API_KEY\", \"\").startswith(\"nvapi-\"):\n", " print(\"Valid NVIDIA_API_KEY already in environment. Delete to reset\")\n", + " nvapi_key = os.environ[\"NVIDIA_API_KEY\"]\n", "else:\n", " nvapi_key = getpass.getpass(\"NVAPI Key (starts with nvapi-): \")\n", " assert nvapi_key.startswith(\"nvapi-\"), f\"{nvapi_key[:5]}... 
is not a valid key\"\n", - " os.environ[\"NVIDIA_API_KEY\"] = nvapi_key\n", - "global nvapi_key" + " os.environ[\"NVIDIA_API_KEY\"] = nvapi_key" ] }, { @@ -125,14 +127,10 @@ "\n", "\n", "\n", - "def fuyu(prompt,img_path):\n", + "def fuyu(prompt, img_path, nvapi_key=nvapi_key):\n", " invoke_url = \"https://ai.api.nvidia.com/v1/vlm/adept/fuyu-8b\"\n", " stream = True\n", - " \n", - " \n", " image_b64=img2base64_string(img_path)\n", - " \n", - " \n", " assert len(image_b64) < 200_000, \\\n", " \"To upload larger images, use the assets API (see docs)\"\n", "\n", @@ -140,7 +138,7 @@ " \"Authorization\": f\"Bearer {nvapi_key}\",\n", " \"Accept\": \"text/event-stream\" if stream else \"application/json\"\n", " }\n", - " \n", + "\n", " payload = {\n", " \"messages\": [\n", " {\n", @@ -154,9 +152,9 @@ " \"seed\": 0,\n", " \"stream\": stream\n", " }\n", - " \n", + "\n", " response = requests.post(invoke_url, headers=headers, json=payload)\n", - " \n", + "\n", " if stream:\n", " output=[]\n", " for line in response.iter_lines():\n", @@ -216,7 +214,7 @@ "source": [ "# test run and see that you can genreate a respond successfully\n", "from langchain_nvidia_ai_endpoints import ChatNVIDIA\n", - "llm = ChatNVIDIA(model=\"ai-mixtral-8x7b-instruct\", nvidia_api_key=nvapi_key, max_tokens=1024)\n" + "llm = ChatNVIDIA(model=\"mistralai/mixtral-8x7b-instruct-v0.1\", nvidia_api_key=nvapi_key, max_tokens=1024)" ] }, { From fcaf0fc374c39ddda03bc68cef147f0f1dbac035 Mon Sep 17 00:00:00 2001 From: Verdi March Date: Mon, 29 Jul 2024 19:42:50 +0800 Subject: [PATCH 5/8] Fix notebook 05 - fix nvapi_key does not use NVIDIA_API_KEY env var - fix faiss-1.8.0 requires allow_dangerous_deserialization=True - fix regex string - fix deprecated model names - make pip install (1st code cell) optional. --- .gitignore | 1 + ...s_with_Langchain_NVIDIA_AI_Endpoints.ipynb | 86 +++++++++---------- 2 files changed, 43 insertions(+), 44 deletions(-) diff --git a/.gitignore b/.gitignore index d3d9b54d..6f0276a4 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ model-cache/ notebooks/dataset/ notebooks/toy_data/nv_embedding/ notebooks/toy_data/jordan.png +notebooks/embed # Python Exclusions .venv diff --git a/notebooks/05_RAG_for_HTML_docs_with_Langchain_NVIDIA_AI_Endpoints.ipynb b/notebooks/05_RAG_for_HTML_docs_with_Langchain_NVIDIA_AI_Endpoints.ipynb index 2f945afb..130401be 100644 --- a/notebooks/05_RAG_for_HTML_docs_with_Langchain_NVIDIA_AI_Endpoints.ipynb +++ b/notebooks/05_RAG_for_HTML_docs_with_Langchain_NVIDIA_AI_Endpoints.ipynb @@ -9,7 +9,7 @@ "\n", "In this notebook we demonstrate how to build a RAG using [NVIDIA AI Endpoints for LangChain](https://python.langchain.com/docs/integrations/text_embedding/nvidia_ai_endpoints). We create a vector store by downloading web pages and generating their embeddings using FAISS. We then showcase two different chat chains for querying the vector store. For this example, we use the NVIDIA Triton documentation website, though the code can be easily modified to use any other source. 
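Regardless of the source, the first stage described below reduces to the same pattern: extract plain text from each page, split it into chunks, embed the chunks, and persist a FAISS index. The following is a simplified, illustrative preview only (it assumes `langchain`, `langchain_nvidia_ai_endpoints`, and `faiss-cpu` are installed and `NVIDIA_API_KEY` is exported); the notebook builds the full pipeline step by step.

```python
# Simplified preview of the ingestion stage; the notebook's own helpers are the authoritative version.
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings

# Stand-in for text extracted from one HTML page.
page_text = "Triton Inference Server is an open source inference serving software."

# Split the page into overlapping chunks suitable for retrieval.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = splitter.split_text(page_text)

# Embed the chunks and persist a FAISS index for later querying.
embedder = NVIDIAEmbeddings(model="NV-Embed-QA")
store = FAISS.from_texts(chunks, embedding=embedder)
store.save_local("./embed")
```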
\n", "\n", - "### First stage is to load NVIDIA Triton documentation from the web, chunkify the data, and generate embeddings using FAISS\n", + "## First stage is to load NVIDIA Triton documentation from the web, chunkify the data, and generate embeddings using FAISS\n", "\n", "To get started:\n", "\n", @@ -37,14 +37,16 @@ "metadata": {}, "outputs": [], "source": [ + "'''\n", "!pip install langchain\n", "!pip install langchain_nvidia_ai_endpoints\n", - "!pip install faiss-cpu" + "!pip install faiss-cpu\n", + "''';" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "980506c9", "metadata": {}, "outputs": [], @@ -70,25 +72,19 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "bf9a84ac", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Enter your NVIDIA API key: ······································································\n" - ] - } - ], + "outputs": [], "source": [ "import getpass\n", "\n", "if not os.environ.get(\"NVIDIA_API_KEY\", \"\").startswith(\"nvapi-\"):\n", " nvapi_key = getpass.getpass(\"Enter your NVIDIA API key: \")\n", " assert nvapi_key.startswith(\"nvapi-\"), f\"{nvapi_key[:5]}... is not a valid key\"\n", - " os.environ[\"NVIDIA_API_KEY\"] = nvapi_key" + " os.environ[\"NVIDIA_API_KEY\"] = nvapi_key\n", + "else:\n", + " nvapi_key = os.environ.get(\"NVIDIA_API_KEY\")" ] }, { @@ -96,12 +92,25 @@ "id": "91fcd102", "metadata": {}, "source": [ - "Helper functions for loading html files, which we'll use to generate the embeddings. We'll use this later to load the relevant html documents from the Triton documentation website and convert to a vector store." + "Helper functions for loading html files, which we'll use to generate the embeddings. We'll use this later to load the relevant html documents from the Triton documentation website and convert to a vector store.\n", + "\n", + "
\n", + "Notes on allow_dangerous_serialization on FAISS.load_local()\n", + "\n", + "

langchain>=0.1.17 requires kwarg allow_dangerous_deserialization=True\n", + "added to FAISS.load_local() to avoid this error: \"The de-serialization relies loading a pickle file.\n", + "Pickle files can be modified to deliver a malicious payload that results in execution of arbitrary code on your machine.\n", + "You will need to set `allow_dangerous_deserialization` to `True` to enable deserialization. If you do this, make sure that you\n", + "trust the source of the data. For example, if you are loading a file that you created, and know that no one else has modified the\n", + "file, then this is safe to do. Do not set this to `True` if you are loading a file from an untrusted source (e.g., some random site on \"\n", + "the internet.)\" (ref) .\n", + "

\n", + "
\n" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "d84c5ef5", "metadata": {}, "outputs": [], @@ -145,7 +154,7 @@ " text = soup.get_text()\n", "\n", " # Remove excess whitespace and newlines\n", - " text = re.sub(\"\\s+\", \" \", text).strip()\n", + " text = re.sub(r\"\\s+\", \" \", text).strip()\n", "\n", " return text\n", " except Exception as e:\n", @@ -166,7 +175,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "6f48635f", "metadata": {}, "outputs": [], @@ -206,12 +215,12 @@ "id": "942934e8", "metadata": {}, "source": [ - "Generate embeddings using NVIDIA AI Endpoints for LangChain and save embeddings to offline vector store in the /embed directory for future re-use" + "Generate embeddings using NVIDIA AI Endpoints for LangChain and save embeddings to offline vector store in the /embed directory for future re-use." ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "27d1aced", "metadata": {}, "outputs": [], @@ -229,7 +238,7 @@ " Returns:\n", " None\n", " \"\"\"\n", - " embeddings = NVIDIAEmbeddings(model=\"ai-embed-qa-4\")\n", + " embeddings = NVIDIAEmbeddings(model=\"NV-Embed-QA\")\n", "\n", " for document in documents:\n", " texts = splitter.split_text(document.page_content)\n", @@ -239,7 +248,7 @@ "\n", " # create embeddings and add to vector store\n", " if os.path.exists(dest_embed_dir):\n", - " update = FAISS.load_local(folder_path=dest_embed_dir, embeddings=embeddings)\n", + " update = FAISS.load_local(folder_path=dest_embed_dir, embeddings=embeddings, allow_dangerous_deserialization=True)\n", " update.add_texts(texts, metadatas=metadatas)\n", " update.save_local(folder_path=dest_embed_dir)\n", " else:\n", @@ -252,7 +261,7 @@ "id": "9831f7ba", "metadata": {}, "source": [ - "### Second stage is to load the embeddings from the vector store and build a RAG using NVIDIAEmbeddings\n", + "## Second stage is to load the embeddings from the vector store and build a RAG using NVIDIAEmbeddings\n", "\n", "Create the embeddings model using NVIDIA Retrieval QA Embedding endpoint. This model represents words, phrases, or other entities as vectors of numbers and understands the relation between words and phrases. 
See here for reference: https://build.nvidia.com/nvidia/embed-qa-4" ] @@ -262,21 +271,10 @@ "execution_count": null, "id": "f56cadd0", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Storing embeddings to ./embed\n" - ] - } - ], + "outputs": [], "source": [ - "\n", - "\n", "create_embeddings()\n", - "\n", - "embedding_model = NVIDIAEmbeddings(model=\"ai-embed-qa-4\")\n" + "embedding_model = NVIDIAEmbeddings(model=\"NV-Embed-QA\")" ] }, { @@ -296,7 +294,7 @@ "source": [ "# Embed documents\n", "embedding_path = \"embed/\"\n", - "docsearch = FAISS.load_local(folder_path=embedding_path, embeddings=embedding_model)" + "docsearch = FAISS.load_local(folder_path=embedding_path, embeddings=embedding_model, allow_dangerous_deserialization=True)" ] }, { @@ -320,7 +318,7 @@ "\n", "question_generator = LLMChain(llm=llm, prompt=CONDENSE_QUESTION_PROMPT)\n", "\n", - "chat = ChatNVIDIA(model=\"ai-mixtral-8x7b-instruct\", temperature=0.1, max_tokens=1000, top_p=1.0)\n", + "chat = ChatNVIDIA(model=\"mistralai/mixtral-8x7b-instruct-v0.1\", temperature=0.1, max_tokens=1000, top_p=1.0)\n", "\n", "doc_chain = load_qa_chain(chat , chain_type=\"stuff\", prompt=QA_PROMPT)\n", "\n", @@ -348,7 +346,7 @@ "outputs": [], "source": [ "query = \"What is Triton?\"\n", - "result = qa({\"question\": query})\n", + "result = qa.invoke({\"question\": query})\n", "print(result.get(\"answer\"))" ] }, @@ -368,7 +366,7 @@ "outputs": [], "source": [ "query = \"What interfaces does Triton support?\"\n", - "result = qa({\"question\": query})\n", + "result = qa.invoke({\"question\": query})\n", "print(result.get(\"answer\"))" ] }, @@ -388,7 +386,7 @@ "outputs": [], "source": [ "query = \"But why?\"\n", - "result = qa({\"question\": query})\n", + "result = qa.invoke({\"question\": query})\n", "print(result.get(\"answer\"))" ] }, @@ -438,7 +436,7 @@ "outputs": [], "source": [ "query = \"What is Triton?\"\n", - "result = qa({\"question\": query})\n", + "result = qa.invoke({\"question\": query})\n", "print(result.get(\"answer\"))" ] }, @@ -458,7 +456,7 @@ "outputs": [], "source": [ "query = \"Does Triton support ONNX?\"\n", - "result = qa({\"question\": query})\n", + "result = qa.invoke({\"question\": query})\n", "print(result.get(\"answer\"))" ] }, @@ -478,7 +476,7 @@ "outputs": [], "source": [ "query = \"But why?\"\n", - "result = qa({\"question\": query})\n", + "result = qa.invoke({\"question\": query})\n", "print(result.get(\"answer\"))" ] } From 8211f2ccad01d855004c01d73ce019475036e199 Mon Sep 17 00:00:00 2001 From: Verdi March Date: Mon, 29 Jul 2024 19:54:59 +0800 Subject: [PATCH 6/8] Fix example 06 - fix nvapi_key does not pick-up NVIDIA_API_KEY env var - parameterized the path to store/load local FAISS embeddings --- ..._LangGraph_HandlingAgent_IntermediateSteps.ipynb | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/notebooks/06_LangGraph_HandlingAgent_IntermediateSteps.ipynb b/notebooks/06_LangGraph_HandlingAgent_IntermediateSteps.ipynb index 8a85c771..ef43cb22 100644 --- a/notebooks/06_LangGraph_HandlingAgent_IntermediateSteps.ipynb +++ b/notebooks/06_LangGraph_HandlingAgent_IntermediateSteps.ipynb @@ -50,12 +50,14 @@ "metadata": {}, "outputs": [], "source": [ + "'''\n", "!pip install --upgrade pip\n", "!pip install wikipedia==1.4.0\n", "!pip install langchain-community==0.2.2\n", "!pip install langchain==0.2.2\n", "!pip install langgraph==0.0.62\n", - "!pip install faiss-gpu==1.7.2" + "!pip install faiss-gpu==1.7.2\n", + "'''" ] }, { @@ -82,6 +84,7 
@@ "# del os.environ['NVIDIA_API_KEY'] ## delete key and reset\n", "if os.environ.get(\"NVIDIA_API_KEY\", \"\").startswith(\"nvapi-\"):\n", " print(\"Valid NVIDIA_API_KEY already in environment. Delete to reset\")\n", + " nvapi_key = os.environ.get('NVIDIA_API_KEY')\n", "else:\n", " nvapi_key = getpass.getpass(\"NVAPI Key (starts with nvapi-): \")\n", " assert nvapi_key.startswith(\"nvapi-\"), f\"{nvapi_key[:5]}... is not a valid key\"\n", @@ -129,7 +132,7 @@ "source": [ "## Step 3 - Retriever from FAISS vector store\n", "\n", - "We need to process a toy example, here we use `Sweden.txt` from the `toy_data` folder." + "We need to process a toy example, here we use `Sweden.txt` from the `toy_data` folder. Please review the path where we'll store the embeddings, and update as necessary." ] }, { @@ -139,6 +142,8 @@ "metadata": {}, "outputs": [], "source": [ + "embed_path = \"/workspace/save_embedding/sv'\n", + "\n", "import os\n", "from tqdm import tqdm\n", "from pathlib import Path\n", @@ -177,7 +182,7 @@ "\n", "# you only need to do this once, in the future, when re-run this notebook, skip to below and load the vector store from disk\n", "store = FAISS.from_texts(docs, embedder , metadatas=metadatas)\n", - "store.save_local('/workspace/save_embedding/sv')\n" + "store.save_local(embed_path)\n" ] }, { @@ -188,7 +193,7 @@ "outputs": [], "source": [ "## If you previously preprocessed and saved the vector store to disk, then reload it here\n", - "faissDB = FAISS.load_local(\"/workspace/save_embedding/sv\", embedder, allow_dangerous_deserialization=True)\n", + "faissDB = FAISS.load_local(embed_path, embedder, allow_dangerous_deserialization=True)\n", "retriever = faissDB.as_retriever()" ] }, From c962ee213e13a460e393c80d2bd5e4c6ae0230e5 Mon Sep 17 00:00:00 2001 From: Verdi March Date: Mon, 29 Jul 2024 22:06:30 +0800 Subject: [PATCH 7/8] Fix example 07 - Fix nvapi_key does not pick-up the NVIDIA_API_KEY env var - fix deprecated model names - document how to start and stop Milvus database on localhost --- ...7_Chat_with_nvidia_financial_reports.ipynb | 73 +++++++++++++------ 1 file changed, 50 insertions(+), 23 deletions(-) diff --git a/notebooks/07_Chat_with_nvidia_financial_reports.ipynb b/notebooks/07_Chat_with_nvidia_financial_reports.ipynb index 9ad5c16a..43b77663 100644 --- a/notebooks/07_Chat_with_nvidia_financial_reports.ipynb +++ b/notebooks/07_Chat_with_nvidia_financial_reports.ipynb @@ -24,7 +24,7 @@ "id": "612375a9", "metadata": {}, "source": [ - "### Step 1 - Export the NVIDIA_API_KEY\n", + "## Step 1 - Export the NVIDIA_API_KEY\n", "Supply the NVIDIA_API_KEY in this notebook when you run the cell below" ] }, @@ -39,6 +39,7 @@ "import os\n", "if os.environ.get(\"NVIDIA_API_KEY\", \"\").startswith(\"nvapi-\"):\n", " print(\"Valid NVIDIA_API_KEY already in environment. Delete to reset\")\n", + " nvapi_key = os.environ['NVIDIA_API_KEY']\n", "else:\n", " nvapi_key = getpass.getpass(\"NVAPI Key (starts with nvapi-): \")\n", " assert nvapi_key.startswith(\"nvapi-\"), f\"{nvapi_key[:5]}... 
is not a valid key\"\n", @@ -50,7 +51,7 @@ "id": "5b4afb52", "metadata": {}, "source": [ - "### Step 2 - initialize the LLM and Embedding Model\n", + "## Step 2 - initialize the LLM and Embedding Model\n", "Here we will use **mixtral_8x7b** " ] }, @@ -63,12 +64,11 @@ "source": [ "# test run and see that you can genreate a respond successfully\n", "from langchain_nvidia_ai_endpoints import ChatNVIDIA,NVIDIAEmbeddings\n", - "llm = ChatNVIDIA(model=\"ai-mixtral-8x7b-instruct\", nvidia_api_key=nvapi_key, max_tokens=1024)\n", + "llm = ChatNVIDIA(model=\"mistralai/mixtral-8x7b-instruct-v0.1\", nvidia_api_key=nvapi_key, max_tokens=1024)\n", "from langchain.vectorstores import Milvus\n", - "import torch\n", - "import time\n", - "embedder_document = NVIDIAEmbeddings(model=\"ai-embed-qa-4\", model_type=\"passage\")\n", - "embedder_query = NVIDIAEmbeddings(model=\"ai-embed-qa-4\", model_type=\"query\")" + "\n", + "embedder_document = NVIDIAEmbeddings(model=\"NV-Embed-QA\", model_type=\"passage\")\n", + "embedder_query = NVIDIAEmbeddings(model=\"NV-Embed-QA\", model_type=\"query\")" ] }, { @@ -76,7 +76,7 @@ "id": "d2104106", "metadata": {}, "source": [ - "### Step 3 - Ingest http files" + "## Step 3 - Ingest http files" ] }, { @@ -84,7 +84,7 @@ "id": "a4e45a22-c883-40cc-b2f4-e6f51866e52b", "metadata": {}, "source": [ - "#### 3.1 Download http files covering financial reports from Fiscal year 2020 to 2024" + "### 3.1 Download http files covering financial reports from Fiscal year 2020 to 2024" ] }, { @@ -107,8 +107,7 @@ " if quarter == \"fourth\":\n", " urls_content.append(requests.get(url_template2.format(**args)).content)\n", " else:\n", - " urls_content.append(requests.get(url_template1.format(**args)).content)\n", - "\n" + " urls_content.append(requests.get(url_template1.format(**args)).content)" ] }, { @@ -116,7 +115,7 @@ "id": "8cbada93-6b0c-49df-a6b9-781f1a2400fb", "metadata": {}, "source": [ - "#### 3.2 Parse html files" + "### 3.2 Parse html files" ] }, { @@ -159,8 +158,7 @@ " soup = BeautifulSoup(url_content, 'html.parser')\n", " url, title, content, tables = extract_url_title_time(soup)\n", " parsed_htmls.append({\"url\":url, \"title\":title, \"content\":content, \"tables\":tables})\n", - "\n", - "\n" + "print(f\"Document count: {len(parsed_htmls)}\")" ] }, { @@ -168,14 +166,18 @@ "id": "d985444a-8ef2-47d7-ba13-1a4da369f468", "metadata": {}, "source": [ - "#### 3.3 Summarize tables" + "### 3.3 Summarize tables\n", + "\n", + "It takes ~4 minutes to summarize the tables in the 2020-2024 reports." 
] }, { "cell_type": "code", "execution_count": null, "id": "8bb88c6c-8167-4b6e-9b83-5a6937f9f9fb", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "# summarize tables\n", @@ -200,11 +202,10 @@ " finally:\n", " return res\n", "\n", - "\n", - "for parsed_item in parsed_htmls:\n", + "for doc_idx, parsed_item in enumerate(parsed_htmls):\n", " title = parsed_item['title']\n", " for idx, table in enumerate(parsed_item['tables']):\n", - " print(f\"parsing tables in {title}...\")\n", + " print(f\"Document {doc_idx} -- parsing tables {idx} in {title}...\")\n", " table = get_table_summary(table, title, llm)\n", " parsed_item['tables'][idx] = table\n" ] @@ -214,7 +215,7 @@ "id": "ccfd4607-3479-4cdd-b120-1f416959cb23", "metadata": {}, "source": [ - "#### 3.4 Split the text/table in chunks, extract embedding for each chunk, and store the embeddinges into milvus vectordb" + "### 3.4 Split the text/table in chunks, extract embedding for each chunk, and store the embeddinges into milvus vectordb" ] }, { @@ -252,6 +253,14 @@ "print(f\"obtain {len(documents)} chunks\")" ] }, + { + "cell_type": "markdown", + "id": "2ba6dd03-adeb-493d-92b4-a03104d9f190", + "metadata": {}, + "source": [ + "Deploy a local Milvus server -- see step 7 on this [tutorial section](https://nvidia.github.io/GenerativeAIExamples/latest/nim-llms.html#build-and-start-the-containers). Then, proceed to the next cell." + ] + }, { "cell_type": "code", "execution_count": null, @@ -261,15 +270,19 @@ "source": [ "COLLECTION_NAME = \"NVIDIA_Finance\"\n", "from langchain.vectorstores import Milvus\n", + "\n", "vectorstore = Milvus(\n", " embedding_function=embedder_document,\n", " collection_name=COLLECTION_NAME,\n", " connection_args={\n", - " \"host\": \"milvus\",\n", + " \"host\": \"localhost\",\n", " \"port\": \"19530\"},\n", " drop_old = True,\n", " auto_id = True\n", " )\n", + "\n", + "# Use langchain-core<=0.2.10 to avoid TypeError: 'NoneType' object is not subscriptable\n", + "# See: https://github.com/langchain-ai/langchain/issues/24116#issuecomment-2223984425\n", "vectorstore.add_documents(documents)\n", "docs = vectorstore.similarity_search(\"what are 2024 Q3 revenues? \")" ] @@ -289,7 +302,6 @@ "metadata": {}, "outputs": [], "source": [ - "\n", "from langchain.prompts.prompt import PromptTemplate\n", "\n", "PROMPT_TEMPLATE = \"\"\"[INST]You are a friendly virtual assistant and maintain a conversational, polite, patient, friendly and gender neutral tone throughout the conversation.\n", @@ -321,7 +333,6 @@ "prompt_template = PromptTemplate(template=PROMPT_TEMPLATE, input_variables=[\"context\", \"question\"])\n", "\n", "\n", - "\n", "def build_context(chunks):\n", " context = \"\"\n", " for chunk in chunks:\n", @@ -342,6 +353,22 @@ "\n", "generate_answer(llm, vectorstore, prompt_template, question)" ] + }, + { + "cell_type": "markdown", + "id": "4bf7b05e-53bf-4a9c-87d1-50b0077d6b2e", + "metadata": {}, + "source": [ + "Lastly, remember to stop the Milvus container. See step 1 in this [tutorial section](https://nvidia.github.io/GenerativeAIExamples/latest/nim-llms.html#stopping-the-containers)." 
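Before shutting the database down, you can keep querying the collection by reusing the `generate_answer` helper together with the `llm`, `vectorstore`, and `prompt_template` objects defined above — for example:

```python
# Further usage example; all names below are defined in earlier cells of this notebook.
question = "How did data center revenue change between fiscal year 2023 and fiscal year 2024?"
generate_answer(llm, vectorstore, prompt_template, question)
```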
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15970f1e-fbc7-4134-adf5-f60d8d10221a", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From f852709665955a6f0c886fe9744b1311e961c8e8 Mon Sep 17 00:00:00 2001 From: Verdi March Date: Mon, 29 Jul 2024 22:13:18 +0800 Subject: [PATCH 8/8] Restore pip install cells from main --- .../04_Agent_use_tools_leveraging_NVIDIA_AI_endpoints.ipynb | 4 +--- ...RAG_for_HTML_docs_with_Langchain_NVIDIA_AI_Endpoints.ipynb | 4 +--- notebooks/06_LangGraph_HandlingAgent_IntermediateSteps.ipynb | 4 +--- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/notebooks/04_Agent_use_tools_leveraging_NVIDIA_AI_endpoints.ipynb b/notebooks/04_Agent_use_tools_leveraging_NVIDIA_AI_endpoints.ipynb index 14b10014..06c63700 100644 --- a/notebooks/04_Agent_use_tools_leveraging_NVIDIA_AI_endpoints.ipynb +++ b/notebooks/04_Agent_use_tools_leveraging_NVIDIA_AI_endpoints.ipynb @@ -44,12 +44,10 @@ "metadata": {}, "outputs": [], "source": [ - "'''\n", "# uncomment the below to install additional python packages.\n", "#!pip install unstructured\n", "#!pip install matplotlib scikit-image\n", - "!pip install gradio==3.48.0\n", - "''';" + "!pip install gradio==3.48.0" ] }, { diff --git a/notebooks/05_RAG_for_HTML_docs_with_Langchain_NVIDIA_AI_Endpoints.ipynb b/notebooks/05_RAG_for_HTML_docs_with_Langchain_NVIDIA_AI_Endpoints.ipynb index 130401be..02e17aa8 100644 --- a/notebooks/05_RAG_for_HTML_docs_with_Langchain_NVIDIA_AI_Endpoints.ipynb +++ b/notebooks/05_RAG_for_HTML_docs_with_Langchain_NVIDIA_AI_Endpoints.ipynb @@ -37,11 +37,9 @@ "metadata": {}, "outputs": [], "source": [ - "'''\n", "!pip install langchain\n", "!pip install langchain_nvidia_ai_endpoints\n", - "!pip install faiss-cpu\n", - "''';" + "!pip install faiss-cpu" ] }, { diff --git a/notebooks/06_LangGraph_HandlingAgent_IntermediateSteps.ipynb b/notebooks/06_LangGraph_HandlingAgent_IntermediateSteps.ipynb index ef43cb22..4270977d 100644 --- a/notebooks/06_LangGraph_HandlingAgent_IntermediateSteps.ipynb +++ b/notebooks/06_LangGraph_HandlingAgent_IntermediateSteps.ipynb @@ -50,14 +50,12 @@ "metadata": {}, "outputs": [], "source": [ - "'''\n", "!pip install --upgrade pip\n", "!pip install wikipedia==1.4.0\n", "!pip install langchain-community==0.2.2\n", "!pip install langchain==0.2.2\n", "!pip install langgraph==0.0.62\n", - "!pip install faiss-gpu==1.7.2\n", - "'''" + "!pip install faiss-gpu==1.7.2" ] }, {