Skip to content

Commit a170338

Browse files
committed
improved chunks, added page to the llm, improved the system prompt and the markdown of the LLM response
1 parent 3338a74 commit a170338

13 files changed

Lines changed: 239 additions & 89 deletions

File tree

app/Http/Integrations/Qdrant/Requests/QueryRequest.php

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,4 +61,30 @@ public function defaultBody(): array
6161

6262
return $body;
6363
}
64+
65+
/**
 * Placeholder method: it contains no executable statements and returns nothing.
 *
 * The commented-out snippets below sketch two token-authenticated HTTP PUT
 * requests (token and URL are left blank):
 *  - one whose body declares "vectors" with size 384 and "Cosine" distance;
 *  - one whose body declares a "keyword" schema for the "doc_id" field.
 *
 * NOTE(review): these look like one-off vector-collection and payload-index
 * setup calls kept for reference — confirm, and move them to a setup
 * command or delete them if they are no longer needed.
 */
protected function random(): void
66+
{
67+
// $response = Http::withToken(
68+
// ""
69+
// )->put(
70+
// "",
71+
// [
72+
// "vectors" => [
73+
// "size" => 384,
74+
// "distance" => "Cosine"
75+
// ]
76+
// ]
77+
// );
78+
79+
// $response = Http::withToken(
80+
// ""
81+
// )->put(
82+
// "",
83+
// [
84+
// "field_name" => "doc_id",
85+
// "field_schema" => "keyword"
86+
// ]
87+
// );
88+
89+
}
6490
}

app/Jobs/ProcessChunkJob.php

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,9 @@ class ProcessChunkJob implements ShouldQueue
1919
{
2020
use Batchable, Queueable;
2121

22-
public $tries = 6;
22+
public $tries = 10;
2323

24-
public $backoff = 20;
24+
public $backoff = 65;
2525

2626
/**
2727
* Create a new job instance.
@@ -62,6 +62,7 @@ public function handle(): void
6262
'page' => $this->chunk['page'] ?? null,
6363
'chunk_index' => $this->chunk['chunk_index'],
6464
'text' => $this->chunk['text'],
65+
'pages_spanned' => $this->chunk['pages_spanned'],
6566
],
6667
]);
6768

app/Jobs/ProcessDocumentJob.php

Lines changed: 6 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -31,12 +31,16 @@ public function handle(): void
3131

3232
$fileId = $file->id;
3333

34-
$pdfText = app(PdfService::class)->getPdfText($file->path);
34+
$pdfService = app(PdfService::class);
3535

36-
$chunks = $this->chuckText($pdfText, 1500, 500);
36+
$documentText = $pdfService->getPdfText($file->path);
37+
38+
$chunks = $pdfService->chunkText($documentText);
3739

3840
$chuckCount = count($chunks);
3941

42+
Log::info('chunks', ['chunks' => $chuckCount]);
43+
4044
$jobs = [];
4145

4246
// $chunks = array_slice($chunks, 0, 2);
@@ -66,43 +70,6 @@ public function handle(): void
6670
->dispatch();
6771
}
6872

69-
/**
 * Cut a string into fixed-size character windows that overlap their predecessor.
 *
 * Each entry carries the trimmed window text, its zero-based position in the
 * result, and the [char_start, char_end) character offsets it was taken from.
 * Offsets are counted in multibyte characters, not bytes.
 *
 * @param  string  $text       Text to split.
 * @param  int     $chunkSize  Maximum number of characters per window.
 * @param  int     $overlap    Number of trailing characters repeated at the start of the next window.
 * @return array  List of ['text', 'chunk_index', 'char_start', 'char_end'] entries; empty for empty input.
 */
protected function chuckText(
    string $text,
    int $chunkSize = 3000,
    int $overlap = 500
): array {
    $total = mb_strlen($text);
    $pieces = [];
    $position = 0;

    for ($from = 0; $from < $total; $position++) {
        $to = min($from + $chunkSize, $total);

        $pieces[] = [
            'text' => trim(mb_substr($text, $from, $to - $from)),
            'chunk_index' => $position,
            'char_start' => $from,
            'char_end' => $to,
        ];

        // The window that reaches the end of the text is the last one.
        if ($to >= $total) {
            break;
        }

        // Step back by the overlap, never before the beginning of the text.
        $from = max(0, $to - $overlap);
    }

    return $pieces;
}
105-
10673
protected function sequenceJobs(): void
10774
{
10875
// foreach ($chunks as $chunk) {

app/Jobs/ProcessUserQueryJob.php

Lines changed: 45 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,17 @@
1212
use Illuminate\Contracts\Queue\ShouldQueue;
1313
use Illuminate\Foundation\Queue\Queueable;
1414
use Illuminate\Support\Facades\Log;
15+
use Prism\Prism\ValueObjects\Messages\AssistantMessage;
16+
use Prism\Prism\ValueObjects\Messages\UserMessage;
1517

1618
class ProcessUserQueryJob implements ShouldQueue
1719
{
1820
use Queueable;
1921

22+
public $tries = 3;
23+
24+
public $backoff = 10;
25+
2026
/**
2127
* Create a new job instance.
2228
*/
@@ -40,24 +46,58 @@ public function handle(): void
4046

4147
$results = VectorDatabase::search($payload);
4248

49+
Log::info(['results count' => $results]);
50+
4351
Log::info('Results', ['results' => $results]);
4452

4553
$context = '';
4654

4755
foreach ($results as $i => $h) {
4856
$p = $h['payload'];
49-
$context .= "---CHUNK {$i}---\n[doc: {$p['doc_id']}, page: {$p['page']}] \n".$p['text']."\n\n";
50-
}
5157

52-
$prompt = "\n\nContext:\n{$context}\nUser: {$this->message->message}\nAnswer:";
58+
$pageLabel = '';
5359

54-
$llmResponse = Llm::prompt(prompt: $prompt);
60+
$pages = $p['pages_spanned'] ?? [];
5561

56-
Log::info('LLM Response', ['llmResponse' => $llmResponse]);
62+
if (is_array($pages)) {
63+
if (count($pages) > 1) {
64+
$pageLabel = 'pages '.implode(', ', $pages);
65+
} elseif (count($pages) === 1) {
66+
$pageLabel = 'page '.$pages[0];
67+
} else {
68+
$pageLabel = 'page '.($p['page'] ?? 'N/A');
69+
}
70+
} else {
71+
$pageLabel = 'page '.$pages;
72+
}
73+
74+
Log::info(['pageLabel' => $pageLabel]);
75+
76+
$context .= "---CHUNK {$i}---\n[doc: {$p['doc_id']}, {$pageLabel}]\n".$p['text']."\n\n";
77+
}
78+
79+
Log::info('Context', ['context' => $context]);
5780

5881
/** @var Conversation $conversation */
5982
$conversation = $this->message->conversation;
6083

84+
$prismMessages = $conversation->messages->map(function ($message) {
85+
/** @phpstan-ignore-next-line */
86+
return $message->participant == MessageParticipant::USER
87+
/** @phpstan-ignore-next-line */
88+
? new UserMessage($message->message)
89+
/** @phpstan-ignore-next-line */
90+
: new AssistantMessage($message->message);
91+
})->all();
92+
93+
$prompt = "\n\nContext:\n{$context}\nUser: {$this->message->message}\nAnswer:";
94+
95+
$prismMessages[] = new UserMessage($prompt);
96+
97+
$llmResponse = Llm::prompt(prismMessages: $prismMessages);
98+
99+
Log::info('LLM Response', ['llmResponse' => $llmResponse]);
100+
61101
$conversation->messages()->create([
62102
'user_id' => $this->message->user_id,
63103
'message' => $llmResponse,

app/Providers/AppServiceProvider.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ public function register(): void
2222
/**
 * Bootstrap application services.
 *
 * Registers the named 'pdf-processing' rate limiter with a limit of
 * 99 per minute.
 */
public function boot(): void
{
    RateLimiter::for('pdf-processing', fn () => Limit::perMinute(99));
}
2828
}

app/Services/Llm/Driver/GeminiDriver.php

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,12 @@ public function prompt(?array $prismMessages = [], ?User $user = null, ?string $
1818
try {
1919
return Prism::text()
2020
->using(Provider::Gemini, 'gemini-2.5-flash')
21-
->withSystemPrompt("You are a helpful AI assistant that primarily answers questions based on the legal documents provided in the Context. When a user asks a question related to the Context, use the information from it as your main source and cite each reference inline using the page id format (e.g., [Page 1]).
22-
If the user's question is general, conversational, or unrelated to the provided Context, you may respond naturally and helpfully using your own general knowledge — do NOT reply with “I couldn't find relevant information in the provided documents.”
23-
Always aim to be informative, friendly, and accurate. When relevant, include citations from the Context, but when not relevant, engage normally as a helpful assistant while replying 'No relevant information found in the provided documents.'.
24-
Never, never, never leave a response empty please!.")
25-
// ->withMessages($prismMessages)
26-
->withPrompt($prompt)
27-
->withMaxTokens(512)
28-
->usingTemperature(0.0)
21+
->withSystemPrompt($this->getSystemPrompt())
22+
->withMessages($prismMessages)
23+
// ->withPrompt($prompt)
24+
->withMaxTokens(8000)
25+
->withClientOptions(['timeout' => 60])
26+
->usingTemperature(0.2)
2927
->asText()->text;
3028
} catch (PrismException $e) {
3129
Log::error('Prompt generation failed:', [
@@ -57,4 +55,25 @@ public function embed(?string $texts = null, ?string $path = null): array
5755
return [];
5856
}
5957
}
58+
59+
/**
 * Build the system prompt handed to the model via withSystemPrompt().
 *
 * The returned text is a fixed Markdown-formatted instruction block: it tells
 * the model to answer from the supplied Context, cite pages inline as
 * [page: PAGE_NUM], perform tax calculations from the rates found in the
 * Context, answer unrelated questions from general knowledge, and never
 * return an empty response.
 *
 * The commented-out lines after the return statement are the previous prompt
 * wording; they are unreachable and kept only for reference.
 *
 * @return string The system prompt, exactly as written below (line breaks included).
 */
protected function getSystemPrompt(): string
60+
{
61+
return "You are a helpful and informative AI assistant specializing in legal document analysis. Your primary goal is to answer questions accurately by prioritizing the information provided in the **Context** section below.
62+
63+
### Core RAG Rules:
64+
1. **Grounded Answers:** When a user asks a question related to the Context, use only the retrieved documents as your source.
65+
2. **Citations:** For every piece of information drawn from the Context, you **must** cite the reference inline using the document and page format, such as **[page: PAGE_NUM]**.
66+
3. **Calculation Priority (New Rule):** If the user asks a question requiring arithmetic, you **must** use the tax bands and rates available in the Context to perform the calculation. State the final answer. **If specific initial income brackets are missing but subsequent brackets and rates are present, you must use reasonable assumptions derived from the common structure of the progressive tax system to complete the calculation.**
67+
4. **Non-Relevant Queries:** If the user's question is general, conversational, or unrelated to the provided Context, you must ignore the Context and respond naturally and helpfully using your own general knowledge. Do not reference the documents or apologize for not finding information.
68+
5. **No Empty Responses:** Under no circumstances should you ever return an empty response.
69+
70+
Always aim to be informative, friendly, and accurate. ";
71+
72+
// You must output the step-by-step arithmetic before stating
73+
// 'You are a helpful AI assistant that primarily answers questions based on the legal documents provided in the Context. When a user asks a question related to the Context, use the information from it as your main source and cite each reference inline using the page id format (e.g., [Page 1]).
74+
// If the user's question is general, conversational, or unrelated to the provided Context, you may respond naturally and helpfully using your own general knowledge — do NOT reply with “I couldn't find relevant information in the provided documents.”
75+
// Always aim to be informative, friendly, and accurate. When relevant, include citations from the Context, but when not relevant, engage normally as a helpful assistant while replying 'No relevant information found in the provided documents.'.
76+
// Never, never, never leave a response empty please!.'
77+
78+
}
6079
}

app/Services/Pdf/PdfService.php

Lines changed: 81 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -31,20 +31,91 @@ public function getPdf(string $filename): string
3131
return Storage::disk('public')->path($pdf);
3232
}
3333

34-
/**
 * Read a stored PDF and return its text page by page.
 *
 * @param  string  $filename  File name resolved to a path through getPdf().
 * @return array  List of ['page' => int, 'text' => ...] entries in the order the
 *                parser yields pages; 'page' is the parser's page key plus one.
 */
public function getPdfText(string $filename): array
{
    $document = (new Parser)->parseFile($this->getPdf($filename));

    $pageTexts = [];

    foreach ($document->getPages() as $offset => $pdfPage) {
        // Shift the parser's key by one to get a human-readable page number.
        $pageTexts[] = [
            'page' => $offset + 1,
            'text' => $pdfPage->getText(),
        ];
    }

    return $pageTexts;
}
4149

42-
// public function chuckPdfToText(
43-
// string $filename,
44-
// int $chunkSize = 3000,
45-
// int $overlap = 500,
46-
// ): array
47-
// {
48-
//
49-
// }
50+
/**
 * Split a paginated document into overlapping character windows and record
 * which pages each window touches.
 *
 * The page texts are concatenated (without a separator) into one string and
 * cut into windows of at most $chunkSize characters; each window after the
 * first starts $overlap characters before the previous window ended. Offsets
 * are counted in multibyte characters, not bytes.
 *
 * @param  array  $documentText  List of ['page' => ..., 'text' => string] entries, in reading order.
 * @param  int    $chunkSize     Maximum characters per chunk; must be at least 1.
 * @param  int    $overlap       Characters shared between consecutive chunks. Values outside
 *                               0..($chunkSize - 1) are clamped into that range so that every
 *                               chunk starts after the previous one.
 * @return array  List of chunks, each with 'text' (trimmed), 'chunk_index', 'char_start',
 *                'char_end', 'page' (first page touched, or null) and 'pages_spanned'
 *                (list of distinct pages touched). Empty when the document has no text.
 *
 * @throws \InvalidArgumentException When $chunkSize is smaller than 1.
 */
public function chunkText(
    array $documentText,
    int $chunkSize = 6000,
    int $overlap = 500
): array {
    if ($chunkSize < 1) {
        throw new \InvalidArgumentException('Chunk size must be at least 1.');
    }

    // With an overlap >= chunk size the next window would start at or before
    // the current one and the loop below would never terminate.
    $overlap = max(0, min($overlap, $chunkSize - 1));

    $fullText = '';
    $pageRanges = [];
    $position = 0;

    // Remember the [start, end) character range each page occupies in the joined text.
    foreach ($documentText as $data) {
        $pageLength = mb_strlen($data['text']);

        $pageRanges[] = [
            'start' => $position,
            'end' => $position + $pageLength,
            'page' => $data['page'],
        ];

        $fullText .= $data['text'];
        $position += $pageLength;
    }

    $chunks = [];
    $len = mb_strlen($fullText);

    $start = 0;
    $index = 0;

    while ($start < $len) {
        $end = min($start + $chunkSize, $len);

        // A page is touched when its range intersects the window [start, end).
        $pagesSpanned = [];
        foreach ($pageRanges as $range) {
            if ($start < $range['end'] && $end > $range['start']) {
                $pagesSpanned[] = $range['page'];
            }
        }

        // array_unique() keeps the original keys; re-index so this is always a plain list.
        $pagesSpanned = array_values(array_unique($pagesSpanned));

        $chunks[] = [
            'text' => trim(mb_substr($fullText, $start, $end - $start)),
            'chunk_index' => $index,
            'char_start' => $start,
            'char_end' => $end,
            'page' => $pagesSpanned[0] ?? null,
            'pages_spanned' => $pagesSpanned,
        ];

        $index++;

        if ($end >= $len) {
            break;
        }

        // Always moves forward by at least one character because overlap < chunkSize.
        $start = $end - $overlap;
    }

    return $chunks;
}
50121
}

app/Services/VectorDatabase/Driver/QdrantDriver.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ public function search(QdrantSearchPayload $data): array
4747
Log::error('Vector database search failed', [
4848
'status' => $response->status(),
4949
// 'body' => $response->body(),
50-
// 'json' => $response->json(),
50+
'json' => $response->json(),
5151
]);
5252
}
5353

0 commit comments

Comments
 (0)