import os
import json
import warnings

import torch
import pymongo  # MongoDB client used to persist the generated Q&A pairs
from io import StringIO
from lxml import etree
from langchain.prompts import PromptTemplate
from unstructured.chunking.title import chunk_by_title
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.csv import partition_csv
from unstructured.partition.pptx import partition_pptx
from unstructured.partition.xlsx import partition_xlsx
from unstructured.partition.text import partition_text
from unstructured.partition.html import partition_html
from unstructured.partition.xml import partition_xml
from unstructured.partition.email import partition_email
from unstructured.partition.docx import partition_docx
# from ctransformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
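
# Pipeline overview: partition each document with unstructured, chunk it by
# title, ask a local Phi-3 model to produce one question-answer pair per
# chunk as JSON, and store the pairs in MongoDB.
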
class Processor:
    def __init__(self):
        # MongoDB setup
        self.mongo_client = pymongo.MongoClient("mongodb://localhost:27017/")  # Update with your MongoDB URI if different
        self.db = self.mongo_client["navy_qna_maker"]  # Create/select the database
        self.collection = self.db["navy_qna_maker"]    # Create/select the collection

        # Model and environment setup
        self.base_path = "/home/bhuvnesh/Documents/Rag/Doc-Chat/data"
        warnings.filterwarnings('ignore')
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")

        model_path = "/home/bhuvnesh/Downloads/Phi-3-mini-128k-instruct-oct7/"
        self.phi3_model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map="auto",
            torch_dtype="auto",
            trust_remote_code=True
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.pipeline = pipeline(
            "text-generation",
            model=self.phi3_model,
            tokenizer=self.tokenizer,
            max_new_tokens=200,
            do_sample=False,
            num_return_sequences=1
        )
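        # A transformers text-generation pipeline returns a list of dicts,
        # one per returned sequence, e.g.:
        #   [{"generated_text": "<completion text>"}]
        # which is the shape the JSON extraction in process_docs relies on.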
        # Alternative local GGUF model via ctransformers (kept for reference):
        # self.llm = AutoModelForCausalLM.from_pretrained(
        #     "TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
        #     model_file="mistral-7b-instruct-v0.1.Q4_K_M.gguf",
        #     model_type="mistral",
        #     gpu_layers=80
        # )

        # Earlier multi-question prompt (superseded by the single-question one below):
        # self.custom_prompt_template = """[INST]
        # Context: The following information is from {source}. Please extract meaningful questions and answers based on the provided context.
        # {context}
        # Task:
        # 1. Generate relevant questions based on the context.
        # 2. Provide answers to each of those questions.
        # 3. Return the questions and answers in a **pure JSON format**, without any additional text.
        # Expected output:
        # {{
        #     "Question": {{"question": "<question>", "answer": "<answer>"}}
        # }}
        # [/INST]"""

        self.custom_prompt_template = """[INST]
Context: The following information is from {source}. Please extract meaningful questions and answers based on the provided context.
{context}
Task:
1. Generate **only one** relevant question based on the context.
2. Provide an answer to that question.
3. Return the question and answer in a **pure JSON format**, without any additional text.
Expected output:
{{
    "questions": {{"question": "<question>", "answer": "<answer>"}}
}}
[/INST]"""
        self.prompt = PromptTemplate(template=self.custom_prompt_template,
                                     input_variables=['source', 'context'])
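
        # NOTE: the [INST] ... [/INST] markers above match the commented-out
        # Mistral model's instruction format; Phi-3 instruct checkpoints are
        # trained on a chat format instead. An untested alternative sketch
        # would let the tokenizer build the prompt:
        #
        #   messages = [{"role": "user", "content": self.prompt.format(
        #       source=source, context=context)}]
        #   chat_prompt = self.tokenizer.apply_chat_template(
        #       messages, tokenize=False, add_generation_prompt=True)
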
    def process_files(self):
        directory_path = self.base_path
        files = os.listdir(directory_path)
        for file_name in files:
            print(f'This is the file name: {file_name}')
            file_path = os.path.join(directory_path, file_name)
            file_store_path = os.path.join(self.base_path, "chromadb_data")
            print(file_store_path)
            if not os.path.exists(file_store_path):
                os.makedirs(file_store_path)
            if file_path.endswith(".pdf"):
                document_elements = partition_pdf(file_path)
                self.process_docs(file_path, file_store_path, document_elements)
            elif file_path.endswith(".csv"):
                csv_elements = partition_csv(filename=file_path, strategy="fast")
                self.process_docs(file_path, file_store_path, csv_elements)
            elif file_path.endswith(".pptx"):
                pptx_elements = partition_pptx(filename=file_path)
                self.process_docs(file_path, file_store_path, pptx_elements)
            elif file_path.endswith(".docx"):
                docx_elements = partition_docx(filename=file_path)
                self.process_docs(file_path, file_store_path, docx_elements)
            elif file_path.endswith(".xlsx"):
                excel_elements = partition_xlsx(filename=file_path)
                self.process_docs(file_path, file_store_path, excel_elements)
            elif file_path.endswith(".txt"):
                text_elements = partition_text(filename=file_path)
                self.process_docs(file_path, file_store_path, text_elements)
            elif file_path.endswith(".html"):
                html_elements = partition_html(filename=file_path)
                self.process_eml_xml_html(file_path, file_store_path, html_elements)  # defined below
            elif file_path.endswith(".xml"):
                xml_elements = partition_xml(filename=file_path)
                self.process_eml_xml_html(file_path, file_store_path, xml_elements)
            elif file_path.endswith(".eml"):
                eml_elements = partition_email(filename=file_path)
                self.process_eml_xml_html(file_path, file_store_path, eml_elements)
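    # The dispatch above calls process_eml_xml_html, but the script never
    # defined it. A minimal sketch, assuming HTML/XML/e-mail elements can go
    # through the same title-chunking path as the other formats:
    def process_eml_xml_html(self, file_path, path, document_elements):
        self.process_docs(file_path, path, document_elements)
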
    def process_docs(self, file_path, path, document_elements):
        try:
            # Filter out the References section and running page headers
            # before chunking, so they never reach the chunks.
            reference_title = None
            for el in document_elements:
                if el.text == "References" and el.category == "Title":
                    reference_title = el
                    break
            if reference_title:
                references_id = reference_title.id
                document_elements = [el for el in document_elements
                                     if el.metadata.parent_id != references_id]
            document_elements = [el for el in document_elements if el.category != "Header"]

            elements = chunk_by_title(document_elements, max_characters=300, overlap=50)

            # Parse the first table's HTML representation, if any (the parsed
            # tree is not used further downstream in this script).
            tables = [el for el in document_elements if el.category == "Table"]
            if tables:
                table_html = tables[0].metadata.text_as_html
                if table_html:
                    parser = etree.XMLParser(remove_blank_text=True)
                    tree = etree.parse(StringIO(table_html), parser)

            for element in elements:
                try:
                    metadata = element.metadata.to_dict()
                    if "languages" in metadata:
                        del metadata["languages"]
                    metadata["source"] = metadata.get("filename", file_path)
                    # Flatten lists and cast anything non-primitive to a
                    # string so the values serialize cleanly.
                    for key, value in metadata.items():
                        if isinstance(value, list):
                            metadata[key] = ', '.join(map(str, value))
                        elif not isinstance(value, (str, int, float, bool)):
                            metadata[key] = str(value)

                    formatted_prompt = self.prompt.format(source='Indian Navy guidelines manual', context=element.text)
                    # Alternative ctransformers call (kept for reference):
                    # response = self.llm(
                    #     formatted_prompt,
                    #     max_new_tokens=300,
                    #     temperature=0.2,
                    #     top_k=1,
                    #     top_p=0.50,
                    #     repetition_penalty=1.1
                    # )

                    # return_full_text=False makes the pipeline return only
                    # the completion, so the prompt's own braces cannot
                    # confuse the JSON extraction below.
                    response = self.pipeline(formatted_prompt, return_full_text=False)
                    generated_text = response[0]['generated_text']

                    # Extract the JSON object embedded in the completion.
                    json_start = generated_text.find('{')
                    json_end = generated_text.rfind('}') + 1
                    json_string = generated_text[json_start:json_end]
                    print("JSON_STRING", json_string)
                    response_json = json.loads(json_string)

                    # Attach provenance metadata to the question item.
                    response_json['questions']['source'] = metadata['source']
                    response_json['questions']['page_number'] = metadata.get('page_number', 'N/A')
                    print(response_json)  # For debugging, to see the generated JSON

                    # Insert the question-answer pair into MongoDB.
                    self.collection.insert_one(response_json['questions'])
                except Exception as e:
                    print(f"Error processing document element: {e}")
        except Exception as e:
            print(f"An error occurred while processing {file_path}: {e}")
# Initialize and run the processor
if __name__ == "__main__":
    processor = Processor()
    processor.process_files()
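    # Quick sanity check (a minimal sketch, assuming the collection defaults
    # above): read back a few stored Q&A pairs to confirm the inserts landed.
    for doc in processor.collection.find().limit(3):
        print(doc.get("question"), "->", doc.get("answer"))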