-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp.py
More file actions
274 lines (224 loc) · 9.31 KB
/
app.py
File metadata and controls
274 lines (224 loc) · 9.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
"""
Streamlit Web UI for RAG System.
Provides file upload, document processing, and chat interface.
"""
import streamlit as st
import os
import tempfile
from typing import List, Tuple
from src.config import Config
from src.rag import DocumentProcessor, RAGChain
from src.utils.helpers import setup_logger, format_context
logger = setup_logger(__name__)
# ---------------------------------------------------------------------------
# Page-level setup: configure the Streamlit page, then inject custom CSS.
# Runs once at import time, before any widgets are rendered.
# ---------------------------------------------------------------------------
st.set_page_config(
    page_title="RAG Document Assistant",
    page_icon="📚",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Style rules used by the HTML fragments rendered elsewhere in the app.
_CUSTOM_CSS = """
<style>
    .main-header {
        font-size: 2.5rem;
        font-weight: bold;
        color: #1f77b4;
        margin-bottom: 1rem;
    }
    .response-box {
        background-color: #f0f2f6;
        padding: 1.5rem;
        border-radius: 0.5rem;
        margin: 1rem 0;
    }
    .source-box {
        background-color: #e8f4f8;
        padding: 1rem;
        border-left: 4px solid #1f77b4;
        margin: 0.5rem 0;
    }
</style>
"""
# unsafe_allow_html is required for Streamlit to render raw <style> tags.
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
def initialize_session_state():
    """Ensure every session-state key used by the app exists.

    Each key is created with an empty default only when missing, so
    Streamlit reruns never clobber live state.
    """
    defaults = {
        "processor": None,            # DocumentProcessor instance (lazy)
        "rag_chain": None,            # RAGChain instance (lazy)
        "documents_processed": 0,     # total chunk count
        "chat_history": [],           # list of {"role", "content"} dicts
        "uploaded_files": [],         # names of processed files
    }
    for key, default in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = default
def validate_configuration():
    """Verify that the required configuration (API keys, etc.) is present.

    Returns:
        bool: True when ``Config.validate()`` succeeds; otherwise an error
        is shown in the UI and False is returned.
    """
    try:
        Config.validate()
    except ValueError as e:
        # Surface the problem in the UI instead of crashing the app.
        st.error(f"❌ Configuration Error: {str(e)}")
        st.info("Please create a `.env` file with the required API keys")
        return False
    return True
def process_uploaded_files(uploaded_files) -> int:
    """
    Process uploaded files and prepare them for RAG.

    Each file is written into a temporary directory (the processor reads
    from disk), chunked by the DocumentProcessor, and recorded in session
    state. Chunk counts accumulate across batches so the statistics stay
    consistent with the growing list of uploaded file names.

    Args:
        uploaded_files: List of uploaded files from Streamlit

    Returns:
        int: Number of chunks processed in this batch (0 on failure)
    """
    if not uploaded_files:
        return 0
    try:
        with st.spinner("Processing documents..."):
            # Lazily create the processor so it persists across reruns.
            if st.session_state.processor is None:
                st.session_state.processor = DocumentProcessor()
            processor = st.session_state.processor
            total_chunks = 0
            # Uploaded files exist only in memory; persist them to a temp
            # directory so the processor can read them from a path.
            with tempfile.TemporaryDirectory() as temp_dir:
                for uploaded_file in uploaded_files:
                    file_path = os.path.join(temp_dir, uploaded_file.name)
                    with open(file_path, "wb") as f:
                        f.write(uploaded_file.getbuffer())
                    chunks = processor.process_file(file_path, uploaded_file.name)
                    total_chunks += chunks
                    # BUGFIX: guard against duplicate entries when the same
                    # file name is processed again in a later batch.
                    if uploaded_file.name not in st.session_state.uploaded_files:
                        st.session_state.uploaded_files.append(uploaded_file.name)
            # Initialize the RAG chain once documents are available.
            if st.session_state.rag_chain is None:
                st.session_state.rag_chain = RAGChain()
            # BUGFIX: accumulate (was: overwrite) so multi-batch uploads
            # report the true total chunk count alongside the file list.
            st.session_state.documents_processed += total_chunks
            return total_chunks
    except Exception as e:
        message = str(e)
        # Distinguish embedding-quota errors so the user gets an
        # actionable message instead of a raw stack trace.
        if "429" in message or "rate" in message.lower() or "quota" in message.lower():
            st.error(
                "Rate limit hit while generating embeddings. Wait a moment or use a key with higher quota."
            )
        else:
            st.error(f"Error processing documents: {message}")
        logger.error(f"Error processing documents: {e}")
        return 0
def display_chat_interface():
    """Display the chat interface.

    Renders the existing chat history, accepts a new question once
    documents have been processed, queries the RAG chain, and shows the
    answer together with its source documents.
    """
    st.subheader("💬 Chat Interface")
    # Replay previous turns so the conversation survives Streamlit reruns.
    if st.session_state.chat_history:
        for message in st.session_state.chat_history:
            with st.chat_message(message["role"]):
                st.markdown(message["content"])
    # Only accept questions after at least one document has been indexed.
    if st.session_state.documents_processed > 0:
        user_input = st.chat_input(
            "Ask a question about your documents...",
            disabled=st.session_state.rag_chain is None
        )
        if user_input:
            # Record and echo the user's message before querying.
            st.session_state.chat_history.append({
                "role": "user",
                "content": user_input
            })
            with st.chat_message("user"):
                st.markdown(user_input)
            # Get response from RAG chain
            with st.spinner("Processing your question..."):
                try:
                    result = st.session_state.rag_chain.query(user_input)
                    answer = result.get("answer", "No answer generated")
                    source_docs = result.get("source_documents", [])
                    # History stores the answer with inlined source snippets
                    # (truncated to 200 chars) so replays keep the citations.
                    response_content = answer
                    if source_docs:
                        response_content += "\n\n**📖 Source Documents:**"
                        for i, doc in enumerate(source_docs, 1):
                            source_text = doc.page_content[:200] + "..." if len(doc.page_content) > 200 else doc.page_content
                            source_file = doc.metadata.get("source", "Unknown")
                            response_content += f"\n\n**Document {i}** ({source_file}):\n{source_text}"
                    # Add assistant response to history
                    st.session_state.chat_history.append({
                        "role": "assistant",
                        "content": response_content
                    })
                    # Live rendering shows sources in collapsible expanders
                    # (truncated to 500 chars) rather than inline text.
                    with st.chat_message("assistant"):
                        st.markdown(answer)
                        if source_docs:
                            st.divider()
                            st.markdown("**📖 Source Documents:**")
                            for i, doc in enumerate(source_docs, 1):
                                with st.expander(f"Document {i}: {doc.metadata.get('source', 'Unknown')}"):
                                    st.markdown(doc.page_content[:500])
                except Exception as e:
                    st.error(f"Error processing query: {str(e)}")
                    logger.error(f"Error processing query: {e}")
    else:
        st.info("📤 Please upload documents first to start chatting.")
def main():
    """Entry point: render the full RAG Document Assistant page.

    Layout: header, a narrow left column for document upload/statistics,
    a wide right column for the chat interface, and a footer.
    """
    # Page header rendered as raw HTML so the .main-header CSS applies.
    st.markdown('<h1 class="main-header">📚 RAG Document Assistant</h1>', unsafe_allow_html=True)
    st.write("Upload your documents and ask questions about them using AI-powered retrieval.")

    initialize_session_state()

    # Bail out early when API keys / config are missing.
    if not validate_configuration():
        return

    # 1:2 split — document management on the left, chat on the right.
    left_col, right_col = st.columns([1, 2])

    with left_col:
        st.subheader("📤 Document Management")
        selected_files = st.file_uploader(
            "Upload documents (.txt, .pdf, .docx)",
            type=["txt", "pdf", "docx"],
            accept_multiple_files=True,
            key="file_uploader"
        )
        if selected_files:
            st.success(f"✅ {len(selected_files)} file(s) selected")
            if st.button("📤 Process Documents", key="process_btn"):
                chunk_count = process_uploaded_files(selected_files)
                if chunk_count > 0:
                    st.success(f"✅ Documents processed! Created {chunk_count} chunks.")
                    st.balloons()

        # Statistics panel, shown once any documents have been indexed.
        if st.session_state.documents_processed > 0:
            st.divider()
            st.subheader("📊 Statistics")
            stats_left, stats_right = st.columns(2)
            with stats_left:
                st.metric("Chunks Processed", st.session_state.documents_processed)
            with stats_right:
                st.metric("Files Uploaded", len(st.session_state.uploaded_files))
            if st.session_state.uploaded_files:
                st.subheader("📋 Uploaded Files")
                for name in st.session_state.uploaded_files:
                    st.write(f"• {name}")

        # Reset everything back to a fresh session.
        if st.button("🗑️ Clear All Data"):
            st.session_state.processor = None
            st.session_state.rag_chain = None
            st.session_state.documents_processed = 0
            st.session_state.chat_history = []
            st.session_state.uploaded_files = []
            st.success("All data cleared!")
            st.rerun()

    with right_col:
        display_chat_interface()

    # Footer
    st.divider()
    st.markdown("""
    <div style="text-align: center; color: gray; margin-top: 2rem;">
        <small>RAG System powered by LangChain, Pinecone, and Google Gemini</small>
    </div>
    """, unsafe_allow_html=True)
# Standard script entry guard: run the app only when executed directly.
if __name__ == "__main__":
    main()