Why does PDFImageReader from phidata keep reading the PDF files again and again even though those PDF files are already present in pgvector, while setting PDFReader as the default skips reading the files? Please help me out #1946
Unanswered
mahendra867
asked this question in
Q&A
Replies: 0 comments
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
-
When I set PDFReader as the default reader, the code logic works fine: it skips reading the PDF files. But when I set PDFImageReader as the default, it does not skip reading the PDF files, even though they already exist in the same single table. What is the issue? Please help me out.
code
import os
from dotenv import load_dotenv
from phi.agent import Agent
from phi.embedder.azure_openai import AzureOpenAIEmbedder
from phi.knowledge.pdf import PDFKnowledgeBase, PDFImageReader, PDFReader
from phi.vectordb.pgvector import PgVector, SearchType
from phi.model.openai import OpenAIChat
from phi.storage.agent.postgres import PgAgentStorage
from sqlalchemy import create_engine, inspect, text
from phi.vectordb.pgvector.index import Ivfflat, HNSW
from phi.embedder.openai import OpenAIEmbedder
# Load environment variables (python-dotenv) before reading any of them.
load_dotenv()
print("Environment variables loaded.")

# Fetch API keys and endpoint from environment variables.
# Each lookup yields None when the variable is not set.
AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# SQLAlchemy connection URL shared by the vector store and the engine below.
db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai"
def check_table_exists(engine, schema, table_name):
    """Report whether ``table_name`` exists in ``schema``.

    Prints one progress line before the lookup and one with the outcome,
    then returns the value produced by the SQLAlchemy inspector's
    ``has_table`` call.
    """
    print(f"Checking if table '{table_name}' exists in schema '{schema}'...")
    table_found = inspect(engine).has_table(table_name, schema=schema)
    print(f"Table '{table_name}' exists: {table_found}")
    return table_found
def is_pdf_in_db(engine, schema, table_name, pdf_name):
    """Report whether a row named after ``pdf_name`` exists in the table.

    The file extension is stripped from ``pdf_name`` and the remainder is
    compared against the table's ``name`` column. ``schema`` and
    ``table_name`` are interpolated directly into the SQL text, so they
    must come from trusted code; only the name value is a bound parameter.
    """
    # Drop the extension (e.g. ".pdf") to get the value stored in `name`.
    stem, _extension = os.path.splitext(pdf_name)
    print(f"Checking if PDF '{stem}' exists in table '{table_name}'...")
    with engine.connect() as conn:
        query = text(f"SELECT 1 FROM {schema}.{table_name} WHERE name = :name")
        first_row = conn.execute(query, {"name": stem}).fetchone()
        found = first_row is not None
    print(f"PDF '{stem}' exists in table '{table_name}': {found}")
    return found
# Set up the PDF knowledge base with vector database
print("Setting up PDF knowledge base with vector database...")
pdf_knowledge_base = PDFKnowledgeBase(
    path="/home/ubuntu/rag-strider-arm64/agentic_new_rag/pdfs",
    vector_db=PgVector(
        table_name="updated_rag81",
        schema="ai",
        db_url=db_url,
        search_type=SearchType.hybrid,
        vector_index=HNSW(),
        embedder=OpenAIEmbedder(
            api_key=OPENAI_API_KEY,
            model="text-embedding-ada-002",
            dimensions=1536,
            encoding_format="float",
        ),
    ),
    # Reader applied to every file under `path` when load() is called.
    reader=PDFReader(),  # Use a default reader
    # NOTE(review): phidata's knowledge base exposes this setting as
    # `num_documents`; the previous `documents=3` keyword did not match that
    # field name, so the value was not applied. Confirm against the
    # installed phidata version.
    num_documents=3,
)
# Define the PgAgentStorage with connection to database.
# The storage line is commented out; the message below is printed regardless.
#storage = PgAgentStorage(table_name="updated_rag81", schema="ai", db_url=db_url)
print("PgAgentStorage initialized.")

# Create a SQLAlchemy engine
engine = create_engine(db_url)

# Before loading, check if the table exists and create it if not.
table_present = check_table_exists(engine, "ai", "updated_rag81")
if table_present:
    print("Table exists. Skipping table creation...")
    # Keep the existing table and skip documents that are already stored.
    pdf_knowledge_base.load(recreate=False, skip_existing=True)
else:
    print("Table does not exist. Creating the table...")
    # First run: build the table and load every document.
    pdf_knowledge_base.load(recreate=True, upsert=True)
# Check if PDFs are already in the database and process accordingly.
# Raw string literal: in a normal string "\a" is the BEL escape (and "\G",
# "\p" are invalid escapes), which silently corrupted this Windows path.
# NOTE(review): this folder differs from the `path` given to the knowledge
# base above — confirm which location is intended.
pdf_folder = r"D:\Geak Minds Projects\agentic_new_rag\pdfs"
for pdf_file in os.listdir(pdf_folder):
    if not pdf_file.endswith(".pdf"):
        continue
    if is_pdf_in_db(engine, "ai", "updated_rag81", pdf_file):
        print(f"Skipping {pdf_file}, already in database.")
        continue
    print(f"Processing {pdf_file}...")
    # Process the PDF file with PDFImageReader
    pdf_reader = PDFImageReader(chunk=True)
    documents = pdf_reader.read(os.path.join(pdf_folder, pdf_file))
    # Store what was read. Previously the result was discarded, so nothing
    # was ever written to the table and the same files were read again on
    # every run.
    # NOTE(review): is_pdf_in_db matches the `name` column against the file
    # name without extension; confirm the reader names its documents the
    # same way, otherwise the skip check will still miss them.
    pdf_knowledge_base.load_documents(documents, skip_existing=True)
# Initialize the RAG agent
print("Initializing RAG Agent...")
agent_description = (
    "You are an intelligent retrieval assistant specialized in utilizing knowledge stored in "
    "a curated set of documents related to the Cybersecurity "
)
rag_agent = Agent(
    name="Agentic RAG Application",
    agent_id="rag-agent",
    description=agent_description,
    model=OpenAIChat(id="gpt-4o-mini"),
    # Retrieval is served by the PDF knowledge base built earlier.
    knowledge=pdf_knowledge_base,
    search_knowledge=True,
    add_context=True,
    read_chat_history=True,
    #storage=storage,
    markdown=True,
    debug_mode=True,
)
print("RAG Agent initialized.")

# Print the agent's response to a query
figure_query = "give me Figure 5 - Assessment file directory after script execution"
rag_agent.print_response(figure_query, stream=True)
Beta Was this translation helpful? Give feedback.
All reactions