From 57a21c156ad8748d74975d2debae4e7674e9a06f Mon Sep 17 00:00:00 2001 From: Marcin Kwiatkowski Date: Mon, 29 Sep 2025 19:38:22 +0200 Subject: [PATCH 1/5] docs: removed empty docs page --- .../Adjusting Token Budgets & Segmentation.md | 9 --------- .../Docs/Customization/Custom Prompt Templates.md | 9 --------- .../Switching Models (Ollama Integration).md | 10 ---------- .../Docs/NitroDigest \342\200\223 Documentation.md" | 5 ----- .../Docs/Use Cases/Summarizing Slack Messages.md | 9 --------- 5 files changed, 42 deletions(-) delete mode 100644 Projects/Nitrodigest/Docs/Customization/Adjusting Token Budgets & Segmentation.md delete mode 100644 Projects/Nitrodigest/Docs/Customization/Custom Prompt Templates.md delete mode 100644 Projects/Nitrodigest/Docs/Customization/Switching Models (Ollama Integration).md delete mode 100644 Projects/Nitrodigest/Docs/Use Cases/Summarizing Slack Messages.md diff --git a/Projects/Nitrodigest/Docs/Customization/Adjusting Token Budgets & Segmentation.md b/Projects/Nitrodigest/Docs/Customization/Adjusting Token Budgets & Segmentation.md deleted file mode 100644 index d29cacb..0000000 --- a/Projects/Nitrodigest/Docs/Customization/Adjusting Token Budgets & Segmentation.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -permalink: projects/nitrodigest/docs/customization/adjusting-token-budgets-and-segmentation ---- -Under construction - ---- -Found an issue? Report a bug: - -#NitroDigest #Docs #NitroDigestDocs diff --git a/Projects/Nitrodigest/Docs/Customization/Custom Prompt Templates.md b/Projects/Nitrodigest/Docs/Customization/Custom Prompt Templates.md deleted file mode 100644 index de3796d..0000000 --- a/Projects/Nitrodigest/Docs/Customization/Custom Prompt Templates.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -permalink: projects/nitrodigest/docs/customization/custom-prompt-templates ---- -Under construction - ---- -Found an issue? 
Report a bug: - -#NitroDigest #Docs #NitroDigestDocs diff --git a/Projects/Nitrodigest/Docs/Customization/Switching Models (Ollama Integration).md b/Projects/Nitrodigest/Docs/Customization/Switching Models (Ollama Integration).md deleted file mode 100644 index 8cc5787..0000000 --- a/Projects/Nitrodigest/Docs/Customization/Switching Models (Ollama Integration).md +++ /dev/null @@ -1,10 +0,0 @@ ---- -permalink: projects/nitrodigest/docs/customization/switching-models ---- - -Under construction - ---- -Found an issue? Report a bug: - -#NitroDigest #Docs #NitroDigestDocs diff --git "a/Projects/Nitrodigest/Docs/NitroDigest \342\200\223 Documentation.md" "b/Projects/Nitrodigest/Docs/NitroDigest \342\200\223 Documentation.md" index 8a17dd2..86a9fc6 100644 --- "a/Projects/Nitrodigest/Docs/NitroDigest \342\200\223 Documentation.md" +++ "b/Projects/Nitrodigest/Docs/NitroDigest \342\200\223 Documentation.md" @@ -32,12 +32,7 @@ Note: this documentation is under development just like the tool. Not all sectio - **Use Cases** - [Summarizing Email Newsletters](Summarizing%20Email%20Newsletters.md) - [Summarizing Web Pages](Summarizing%20Web%20Pages.md) - - [Summarizing Slack Messages](Summarizing%20Slack%20Messages.md) - [Summarizing GitHub Pull Requests](Summarizing%20GitHub%20Pull%20Requests.md) -- **Customization** - - [Custom Prompt Templates](Custom%20Prompt%20Templates.md) - - [Switching Models (Ollama Integration)](Switching%20Models%20(Ollama%20Integration).md) - - [Adjusting Token Budgets & Segmentation](Adjusting%20Token%20Budgets%20&%20Segmentation.md) - **Contributing** - [Getting started](Getting%20started.md) - [Ollama setup](Ollama%20setup.md) diff --git a/Projects/Nitrodigest/Docs/Use Cases/Summarizing Slack Messages.md b/Projects/Nitrodigest/Docs/Use Cases/Summarizing Slack Messages.md deleted file mode 100644 index 7c16c6d..0000000 --- a/Projects/Nitrodigest/Docs/Use Cases/Summarizing Slack Messages.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -permalink: 
projects/nitrodigest/docs/use-cases/summarizing-slack-messages ---- -Under construction - ---- -Found an issue? Report a bug: - -#NitroDigest #Docs #NitroDigestDocs From b0070add4a193dc77f66cdcd3643c6afebbb2de9 Mon Sep 17 00:00:00 2001 From: Marcin Kwiatkowski Date: Mon, 29 Sep 2025 21:11:02 +0200 Subject: [PATCH 2/5] feat: added --include-original flag Added a flag that allows including the original text in the generated output --- .../Docs/Getting Started/Quickstart.md | 10 ++++ .../Guides/Understanding the Output Format.md | 55 +++++++++++++++++++ .../Guides/Using a Custom Configuration.md | 16 ++++++ Projects/Nitrodigest/README.md | 3 +- Projects/Nitrodigest/src/cli/main.py | 43 ++++++++++----- 5 files changed, 113 insertions(+), 14 deletions(-) diff --git a/Projects/Nitrodigest/Docs/Getting Started/Quickstart.md b/Projects/Nitrodigest/Docs/Getting Started/Quickstart.md index 72537b6..3647e78 100644 --- a/Projects/Nitrodigest/Docs/Getting Started/Quickstart.md +++ b/Projects/Nitrodigest/Docs/Getting Started/Quickstart.md @@ -39,6 +39,16 @@ When the process is done, you can simply see the summary from file. Example: cat summary.md ``` +## 4. Including Original Text (Optional) + +If you want to include the original text alongside the summary, use the `--include-original` flag: + +```bash +nitrodigest example.txt --include-original > summary-with-original.md +``` + +This will append the original text to the output after the summary, which can be useful for reference or comparison purposes.
+ Read next: - [Summarizing All Files in a Directory](Summarizing%20All%20Files%20in%20a%20Directory.md) diff --git a/Projects/Nitrodigest/Docs/Guides/Understanding the Output Format.md b/Projects/Nitrodigest/Docs/Guides/Understanding the Output Format.md index 6c4a37c..c7ad14f 100644 --- a/Projects/Nitrodigest/Docs/Guides/Understanding the Output Format.md +++ b/Projects/Nitrodigest/Docs/Guides/Understanding the Output Format.md @@ -183,6 +183,61 @@ Create a summary table for this document: | Timeline | Important dates or deadlines | ``` +### Including Original Text + +By default, NitroDigest only outputs the summary. You can include the original text alongside the summary using the `--include-original` flag: + +```bash +# Include original text with summary +nitrodigest document.txt --include-original +``` + +**Text Format Output with Original:** + +```yaml +--- +title: document-name.txt +source: file:///absolute/path/to/document-name.txt +date: '2025-05-16 07:50:22' +id: document-name.txt +summary_date: '2025-05-26 07:55:46' +model: mistral +tokens: 189 +--- + +# Summary + +1. Key summary points here... +2. Additional summary content... + +# Tags + +1. tag1 +2. 
tag2 + +--- + +## Original Text + +[The complete original text content would appear here] +``` + +**JSON Format with Original:** + +```json +{ + "summary": ["Summary content here..."], + "tags": ["tag1", "tag2"], + "metadata": { + "title": "document.txt", + "source": "file:///path/to/document.txt", + "date": "2025-05-16 07:50:22", + "id": "document.txt" + }, + "original_text": "The complete original text content would appear here" +} +``` + ### JSON Structured Output You can use the `--format` flag to change output format to JSON: diff --git a/Projects/Nitrodigest/Docs/Guides/Using a Custom Configuration.md b/Projects/Nitrodigest/Docs/Guides/Using a Custom Configuration.md index c2f150e..9b508b2 100644 --- a/Projects/Nitrodigest/Docs/Guides/Using a Custom Configuration.md +++ b/Projects/Nitrodigest/Docs/Guides/Using a Custom Configuration.md @@ -95,6 +95,22 @@ Change output format to `json`: nitrodigest document.txt --format json ``` +### Include Original Text + +**Default:** `False` + +Include the original text alongside the summary in the output: + +```bash +# Include original text in summary output +nitrodigest document.txt --include-original + +# Exclude original text (default behavior) +nitrodigest document.txt +``` + +When the `--include-original` flag is present, the original text will be appended after the summary in text format, or included as an `original_text` field in JSON format. + ## Setting Up Default Configurations ### Environment Variables diff --git a/Projects/Nitrodigest/README.md b/Projects/Nitrodigest/README.md index ef5de0b..b8ea879 100644 --- a/Projects/Nitrodigest/README.md +++ b/Projects/Nitrodigest/README.md @@ -63,7 +63,8 @@ Available arguments: - `--prompt`: Direct prompt content (overrides prompt-file) - `--model`: Model that will be used for summarization (default: mistral) - `--ollama_api_url`: URL of Ollama API (default: ) -- `--format`: Output format. Can be `text` or `json` (default: text)" +- `--format`: Output format. 
Can be `text` or `json` (default: text) +- `--include-original`: Include original text in the summary output (default: False) ### Custom Prompt Configuration diff --git a/Projects/Nitrodigest/src/cli/main.py b/Projects/Nitrodigest/src/cli/main.py index 05424de..1e30dd2 100644 --- a/Projects/Nitrodigest/src/cli/main.py +++ b/Projects/Nitrodigest/src/cli/main.py @@ -55,6 +55,12 @@ def main(): default="text", help="Output format. Can be 'text' or 'json' (default: text)" ) + parser.add_argument( + "--include-original", + default=False, + action="store_true", + help="Include original text in the summary output" + ) args = parser.parse_args() @@ -93,14 +99,17 @@ def main(): if not sys.stdin.isatty(): content = sys.stdin.read() - process_text(content, summarizer, args.format) + process_text(content, summarizer, args.format, args.include_original) else: if os.path.isfile(args.content): - process_file(args.content, summarizer, args.format) + process_file(args.content, summarizer, + args.format, args.include_original) elif os.path.isdir(args.content): - process_directory(args.content, summarizer, args.format) + process_directory(args.content, summarizer, + args.format, args.include_original) else: - process_text(args.content, summarizer, args.format) + process_text(args.content, summarizer, + args.format, args.include_original) # Clean up a temporary prompt file if it was created if (args.prompt and config.prompt_file and @@ -110,7 +119,7 @@ def main(): return 0 -def process_text(content: str, summarizer: OllamaSummarizer, format: str) -> int: +def process_text(content: str, summarizer: OllamaSummarizer, format: str, include_original: bool) -> int: try: logger.info("Processing text...") @@ -120,19 +129,18 @@ def process_text(content: str, summarizer: OllamaSummarizer, format: str) -> int "source": "text" } - return _generate_summary(content, summarizer, metadata, format) + return _generate_summary(content, summarizer, metadata, format, include_original) except Exception as 
e: logger.error(f"Error processing text: {e}") return -1 -def process_file(file_path, summarizer, format: str): +def process_file(file_path, summarizer, format: str, include_original: bool): """Process a single file for summarization""" try: logger.info(f"Processing file: {file_path}") - # Read the file content with open(file_path, 'r', encoding='utf-8') as f: content = f.read() @@ -140,7 +148,6 @@ def process_file(file_path, summarizer, format: str): logger.warning(f"Warning: File '{file_path}' is empty") return -1 - # Create metadata from file info file_name = os.path.basename(file_path) metadata = { 'title': file_name, @@ -150,13 +157,13 @@ def process_file(file_path, summarizer, format: str): } logger.info(f"Generating summary for {file_name}...") - return _generate_summary(content, summarizer, metadata, format) + return _generate_summary(content, summarizer, metadata, format, include_original) except Exception: raise -def process_directory(directory_path, summarizer, format: str): +def process_directory(directory_path, summarizer, format: str, include_original: bool): """Process all text files in a directory for summarization""" logger.info(f"Processing directory: {directory_path}") @@ -169,7 +176,8 @@ def process_directory(directory_path, summarizer, format: str): if filename.lower().endswith(('.txt', '.md', '.html', '.htm', '.xml', '.json', '.csv', '.log')): file_path = os.path.join(root, filename) try: - process_file(file_path, summarizer, format) + process_file(file_path, summarizer, + format, include_original) success_count += 1 logger.info(f"File {success_count} processed successfully") except Exception as e: @@ -182,7 +190,7 @@ def process_directory(directory_path, summarizer, format: str): f"Directory processing complete: {success_count} of {file_count} files processed successfully") -def _generate_summary(content, summarizer, metadata, format): +def _generate_summary(content, summarizer, metadata, format, include_original=True) -> int: result = 
summarizer.summarize(content, metadata) if not result.is_success(): @@ -210,9 +218,18 @@ def _generate_summary(content, summarizer, metadata, format): ) print('---\n') print(_json_to_text(summary)) + + if include_original: + print("\n---\n") + print("## Original Text\n") + print(content) elif format == 'json': json_summary = json.loads(summary) json_summary["metadata"] = metadata + + if include_original: + json_summary["original_text"] = content + print(json.dumps(json_summary, ensure_ascii=False, indent=2)) else: print(summary) From a1351b8e994dc2adcb256c865765b41ef7cde2a5 Mon Sep 17 00:00:00 2001 From: Marcin Kwiatkowski Date: Tue, 30 Sep 2025 21:43:20 +0200 Subject: [PATCH 3/5] feat: summarize in current working directory When running nitrodigest without arguments, it will try to summarize files in CWD. --- .../Docs/Getting Started/Quickstart.md | 10 ++++++++++ .../Summarizing All Files in a Directory.md | 17 +++++++++++++++++ Projects/Nitrodigest/README.md | 6 ++++++ Projects/Nitrodigest/src/cli/main.py | 5 +++++ 4 files changed, 38 insertions(+) diff --git a/Projects/Nitrodigest/Docs/Getting Started/Quickstart.md b/Projects/Nitrodigest/Docs/Getting Started/Quickstart.md index 3647e78..a3a326c 100644 --- a/Projects/Nitrodigest/Docs/Getting Started/Quickstart.md +++ b/Projects/Nitrodigest/Docs/Getting Started/Quickstart.md @@ -15,6 +15,16 @@ nitrodigest example.txt This command will use the default settings to summarize `example.txt`. The tool will connect to the local Ollama model, generate a summary, and save the result. +### Alternative: Process Current Directory + +You can also run NitroDigest without any arguments to process all supported files in your current directory: + +```bash +nitrodigest +``` + +This will automatically find and summarize all text files (`.txt`, `.md`, `.html`, `.json`, `.csv`, `.log`, etc.) in the current working directory. + ## 3.
Observe the output You should see console messages indicating the file is being processed and where the summary is saved. For example: diff --git a/Projects/Nitrodigest/Docs/Guides/Summarizing All Files in a Directory.md b/Projects/Nitrodigest/Docs/Guides/Summarizing All Files in a Directory.md index 120f303..e53aa39 100644 --- a/Projects/Nitrodigest/Docs/Guides/Summarizing All Files in a Directory.md +++ b/Projects/Nitrodigest/Docs/Guides/Summarizing All Files in a Directory.md @@ -18,6 +18,16 @@ This command will: 3. Process each file individually using your default model 4. Output each summary to the terminal in sequence +### Process Current Directory + +You can also run NitroDigest without any arguments to automatically process all supported files in your current working directory: + +```bash +nitrodigest +``` + +This is particularly useful when you're already in the directory you want to process and don't want to specify the path explicitly. + ## Supported File Types NitroDigest automatically processes files with these extensions: @@ -115,6 +125,13 @@ For directories with just a few files, processing is straightforward and fast: nitrodigest my_notes/ ``` +Or if you're already in the directory: + +```bash +cd my_notes/ +nitrodigest +``` + ## Directory Processing Behavior ### Recursive Processing diff --git a/Projects/Nitrodigest/README.md b/Projects/Nitrodigest/README.md index b8ea879..b4a488e 100644 --- a/Projects/Nitrodigest/README.md +++ b/Projects/Nitrodigest/README.md @@ -34,6 +34,12 @@ nitrodigest > summary.md +``` + Summarize one file and save it to summary.md: ```bash diff --git a/Projects/Nitrodigest/src/cli/main.py b/Projects/Nitrodigest/src/cli/main.py index 1e30dd2..6882ad9 100644 --- a/Projects/Nitrodigest/src/cli/main.py +++ b/Projects/Nitrodigest/src/cli/main.py @@ -100,6 +100,11 @@ def main(): if not sys.stdin.isatty(): content = sys.stdin.read() process_text(content, summarizer, args.format, args.include_original) + elif not args.content: + 
current_dir = os.getcwd() + process_directory(current_dir, summarizer, + args.format, args.include_original) + else: if os.path.isfile(args.content): process_file(args.content, summarizer, From c0f9563c95e80a096da6f6e66e48aec19490a630 Mon Sep 17 00:00:00 2001 From: Marcin Kwiatkowski Date: Wed, 1 Oct 2025 07:35:33 +0200 Subject: [PATCH 4/5] feat: parallel processing --- .../Docs/Getting Started/Quickstart.md | 2 +- .../Summarizing All Files in a Directory.md | 84 +++++++-- ...NitroDigest \342\200\223 Documentation.md" | 2 +- Projects/Nitrodigest/README.md | 1 + Projects/Nitrodigest/setup.cfg | 1 + Projects/Nitrodigest/src/cli/main.py | 177 ++++++++++++++++-- Projects/Nitrodigest/src/cli/requirements.txt | 3 + 7 files changed, 236 insertions(+), 34 deletions(-) diff --git a/Projects/Nitrodigest/Docs/Getting Started/Quickstart.md b/Projects/Nitrodigest/Docs/Getting Started/Quickstart.md index a3a326c..fe1cc8f 100644 --- a/Projects/Nitrodigest/Docs/Getting Started/Quickstart.md +++ b/Projects/Nitrodigest/Docs/Getting Started/Quickstart.md @@ -23,7 +23,7 @@ You can also run NitroDigest without any arguments to process all supported file nitrodigest ``` -This will automatically find and summarize all text files (`.txt`, `.md`, `.html`, `.json`, `.csv`, `.log`, etc.) in the current working directory. +This will automatically find and summarize all text files (`.txt`, `.md`, `.html`, `.json`, `.csv`, `.log`, etc.) in the current working directory. NitroDigest processes multiple files in parallel (up to 4 simultaneously by default) and shows a progress bar during processing. ## 3. 
Observe the output diff --git a/Projects/Nitrodigest/Docs/Guides/Summarizing All Files in a Directory.md b/Projects/Nitrodigest/Docs/Guides/Summarizing All Files in a Directory.md index e53aa39..9b32c27 100644 --- a/Projects/Nitrodigest/Docs/Guides/Summarizing All Files in a Directory.md +++ b/Projects/Nitrodigest/Docs/Guides/Summarizing All Files in a Directory.md @@ -15,8 +15,9 @@ This command will: 1. Scan the directory and all subdirectories 2. Find all supported text files -3. Process each file individually using your default model -4. Output each summary to the terminal in sequence +3. Process multiple files in parallel using your default model (up to 4 files simultaneously by default) +4. Display a progress bar during processing +5. Output all summaries at the end ### Process Current Directory @@ -66,19 +67,22 @@ Will process `meeting-notes.txt`, `project-report.md`, `data-analysis.csv`, `web ### Terminal Output (Default) -By default, all summaries are displayed in your terminal one after another: +By default, all summaries are displayed in your terminal after processing completes: ```bash nitrodigest documents/ ``` -You'll see processing messages and formatted summaries for each file: +You'll see a progress bar during processing, followed by all summaries: ```bash Processing directory: documents/ -Processing file: documents/meeting-notes.txt -Generating summary for meeting-notes.txt... -2025-05-26 07:55:42,615 - cli.summarizer.base.OllamaSummarizer - INFO - Sending request to Ollama API using model mistral +Found 5 files to process with 4 workers + +Processing: 100%|████████████████████| 5/5 [00:12<00:00, 2.45s/file] ✓ more-notes.txt + +Processing complete: 5 successful, 0 failed + --- date: '2025-05-16 07:50:22' id: documents/meeting-notes.txt @@ -91,10 +95,15 @@ tokens: 189 -Processing file: documents/project-report.md -Generating summary for project-report.md... 
+================================================================================ + +--- +date: '2025-05-16 08:15:10' +id: documents/project-report.md ... -Directory processing complete: 4 of 4 files processed successfully +--- + + ``` ### Save All Summaries to One File @@ -149,12 +158,33 @@ project/ All three files (`overview.md`, `specifications.txt`, and `notes.txt`) will be processed. -### File Ordering +### Parallel Processing -Files are processed in the order they're discovered by the file system, which typically means: +NitroDigest processes multiple files simultaneously to improve performance. By default, it uses 4 parallel workers, meaning up to 4 files can be processed at the same time. -- Files in the main directory first -- Then files in subdirectories +#### Adjusting Parallel Workers + +You can control the number of parallel workers based on your system resources and needs: + +```bash +# Use 8 workers for faster processing (good for powerful systems) +nitrodigest documents/ --max-workers 8 + +# Use 2 workers for slower systems or to reduce resource usage +nitrodigest documents/ --max-workers 2 + +# Use 1 worker for sequential processing +nitrodigest documents/ --max-workers 1 +``` + +**When to adjust workers:** +- **Increase workers (6-8):** If you have a powerful system and want maximum speed +- **Decrease workers (1-2):** If you have limited RAM, CPU, or want to reduce system load +- **Keep default (4):** For most use cases, this provides a good balance + +### File Ordering + +Files are processed in parallel, so they may complete in a different order than discovered. However, all files in the directory and subdirectories will be processed. 
## Practical Use Cases @@ -192,6 +222,20 @@ nitrodigest meeting_notes_march/ > march_meetings_summary.md ## Tips and Best Practices +### Performance Optimization + +For best performance when processing large directories: + +```bash +# Use more workers on powerful systems +nitrodigest large_directory/ --max-workers 8 > summaries.md + +# Monitor your system resources (CPU, RAM) and adjust workers accordingly +# If Ollama is running on the same machine, consider your model's resource needs +``` + +**Pro tip:** The optimal number of workers depends on your Ollama setup. If Ollama is using significant resources, fewer workers may actually be faster. + ### Organize Your Input Structure your directories logically before processing: @@ -253,6 +297,18 @@ If your directory contains specialized content, use a custom prompt: nitrodigest technical_docs/ --prompt "Summarize this technical document focusing on implementation details and requirements" > tech_summaries.md ``` +### Combining Parallel Processing with Other Options + +You can combine `--max-workers` with other options for optimized processing: + +```bash +# Fast processing with custom model and 8 workers +nitrodigest documents/ --model llama3 --max-workers 8 > summaries.md + +# Slower but thorough processing with 2 workers and custom prompt +nitrodigest research/ --max-workers 2 --prompt-file research_prompt.txt > research_summaries.md +``` + ## Next Steps - **[Custom Prompts](./Overriding%20Prompt%20Templates.md):** Explore Overriding Prompt Templates for specialized content diff --git "a/Projects/Nitrodigest/Docs/NitroDigest \342\200\223 Documentation.md" "b/Projects/Nitrodigest/Docs/NitroDigest \342\200\223 Documentation.md" index 86a9fc6..d6988ec 100644 --- "a/Projects/Nitrodigest/Docs/NitroDigest \342\200\223 Documentation.md" +++ "b/Projects/Nitrodigest/Docs/NitroDigest \342\200\223 Documentation.md" @@ -10,7 +10,7 @@ permalink: projects/nitrodigest/docs - **Local AI Summarization:** Uses Ollama to run LLMs on 
your machine, preserving privacy and working offline. - **Multiple Input Formats:** Supports plain text, Markdown, HTML, CSV, JSON, and other text-based files. - **Multiple Output Formats: By default NitroDigest returns Text, but for advanced processing it can return JSON. -- **Batch Processing:** Summarize a single file or all files in a directory in one command. +- **Parallel Batch Processing:** Summarize a single file or process multiple files in a directory simultaneously with configurable parallel workers for faster processing. - **Configurable Prompts:** Uses prompt templates that you can customize to change the style or content of summaries. - **Extensible:** Easily switch to different models (e.g., use a larger or domain-specific Ollama model) and adjust token budgets or segmentation for large inputs. diff --git a/Projects/Nitrodigest/README.md b/Projects/Nitrodigest/README.md index b4a488e..00b628d 100644 --- a/Projects/Nitrodigest/README.md +++ b/Projects/Nitrodigest/README.md @@ -71,6 +71,7 @@ Available arguments: - `--ollama_api_url`: URL of Ollama API (default: ) - `--format`: Output format. 
Can be `text` or `json` (default: text) - `--include-original`: Include original text in the summary output (default: False) +- `--max-workers`: Maximum number of parallel workers for directory processing (default: 4) ### Custom Prompt Configuration diff --git a/Projects/Nitrodigest/setup.cfg b/Projects/Nitrodigest/setup.cfg index bc8d019..a471f29 100644 --- a/Projects/Nitrodigest/setup.cfg +++ b/Projects/Nitrodigest/setup.cfg @@ -20,6 +20,7 @@ install_requires = pyyaml>=6.0 nltk>=3.9.1 emoji>=2.14.1 + tqdm>=4.67.1 [options.packages.find] where = src diff --git a/Projects/Nitrodigest/src/cli/main.py b/Projects/Nitrodigest/src/cli/main.py index 6882ad9..25aa3b3 100644 --- a/Projects/Nitrodigest/src/cli/main.py +++ b/Projects/Nitrodigest/src/cli/main.py @@ -5,6 +5,8 @@ import yaml from datetime import datetime import json +from concurrent.futures import ThreadPoolExecutor, as_completed +from tqdm import tqdm from .summarizer import ( OllamaSummarizer, @@ -61,6 +63,12 @@ def main(): action="store_true", help="Include original text in the summary output" ) + parser.add_argument( + "--max-workers", + type=int, + default=4, + help="Maximum number of parallel workers for directory processing (default: 4)" + ) args = parser.parse_args() @@ -103,7 +111,7 @@ def main(): elif not args.content: current_dir = os.getcwd() process_directory(current_dir, summarizer, - args.format, args.include_original) + args.format, args.include_original, args.max_workers) else: if os.path.isfile(args.content): @@ -111,7 +119,7 @@ def main(): args.format, args.include_original) elif os.path.isdir(args.content): process_directory(args.content, summarizer, - args.format, args.include_original) + args.format, args.include_original, args.max_workers) else: process_text(args.content, summarizer, args.format, args.include_original) @@ -142,7 +150,7 @@ def process_text(content: str, summarizer: OllamaSummarizer, format: str, includ def process_file(file_path, summarizer, format: str, include_original: 
bool): - """Process a single file for summarization""" + """Process a single file for summarization and print results""" try: logger.info(f"Processing file: {file_path}") @@ -168,31 +176,164 @@ def process_file(file_path, summarizer, format: str, include_original: bool): raise -def process_directory(directory_path, summarizer, format: str, include_original: bool): - """Process all text files in a directory for summarization""" - logger.info(f"Processing directory: {directory_path}") +def _process_file_return_result(file_path, summarizer, format: str, include_original: bool): + """Process a single file and return the result without printing""" + try: + with open(file_path, 'r', encoding='utf-8') as f: + content = f.read() + + if not content.strip(): + return None + + file_name = os.path.basename(file_path) + metadata = { + 'title': file_name, + 'source': 'file://' + os.path.abspath(file_path), + 'date': datetime.fromtimestamp(os.path.getmtime(file_path)).strftime("%Y-%m-%d %H:%M:%S"), + 'id': file_path + } + + result = summarizer.summarize(content, metadata) - file_count = 0 - success_count = 0 + if not result.is_success(): + return None + return { + 'content': content, + 'metadata': metadata, + 'summary': result.summary, + 'model_used': result.model_used, + 'tokens_used': result.tokens_used, + 'file_path': file_path + } + + except Exception: + raise + + +def process_directory(directory_path, summarizer, format: str, include_original: bool, max_workers: int = 4): + """Process all text files in a directory with parallel processing and progress tracking""" + + files_to_process = [] for root, _, files in os.walk(directory_path): for filename in files: - # Only process text files - check common text file extensions if filename.lower().endswith(('.txt', '.md', '.html', '.htm', '.xml', '.json', '.csv', '.log')): file_path = os.path.join(root, filename) + files_to_process.append(file_path) + + file_count = len(files_to_process) + + if file_count == 0: + print("No text 
files found to process") + return + + print(f"\nProcessing directory: {directory_path}") + print(f"Found {file_count} files to process with {max_workers} workers\n") + + import logging + original_levels = {} + for log_name in ['cli.summarizer.base.OllamaSummarizer', 'cli.main']: + log = logging.getLogger(log_name) + original_levels[log_name] = log.level + log.setLevel(logging.WARNING) + + results = [] + errors = [] + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + future_to_file = { + executor.submit(_process_file_return_result, file_path, summarizer, format, include_original): file_path + for file_path in files_to_process + } + + with tqdm( + total=file_count, + desc="Processing", + unit="file", + bar_format='{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]', + leave=True, + position=0 + ) as pbar: + for future in as_completed(future_to_file): + file_path = future_to_file[future] + file_name = os.path.basename(file_path) + try: - process_file(file_path, summarizer, - format, include_original) - success_count += 1 - logger.info(f"File {success_count} processed successfully") + result = future.result() + if result: + results.append(result) + pbar.set_postfix_str( + f"✓ {file_name[:50]}", refresh=True) + else: + errors.append( + (file_path, "Empty file or failed to generate summary")) + pbar.set_postfix_str( + f"✗ {file_name[:50]}", refresh=True) except Exception as e: - logger.error( - f"Error when processing file {file_path}: {e}") + errors.append((file_path, str(e))) + pbar.set_postfix_str(f"✗ {file_name[:50]}", refresh=True) finally: - file_count += 1 + pbar.update(1) + + for log_name, level in original_levels.items(): + logging.getLogger(log_name).setLevel(level) + + print( + f"\nProcessing complete: {len(results)} successful, {len(errors)} failed\n") - logger.info( - f"Directory processing complete: {success_count} of {file_count} files processed successfully") + if errors: + print("Failed files:") + 
for file_path, error in errors: + print(f" - {os.path.basename(file_path)}: {error}") + print() + + for idx, result in enumerate(results, 1): + _print_result(result, format, include_original) + if idx < len(results): + print("\n" + "=" * 80 + "\n") + + +def _print_result(result, format: str, include_original: bool): + """Print a single result""" + metadata = result['metadata'] + summary = result['summary'] + content = result['content'] + + if format == 'text': + print('---') + yaml.dump( + { + 'title': metadata.get('title', 'Untitled'), + 'source': metadata.get('source', 'Unknown'), + 'date': metadata.get('date', datetime.now().strftime("%Y-%m-%d")), + 'id': metadata.get('id', ''), + 'summary_date': datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + 'model': result['model_used'], + 'tokens': result['tokens_used'] + }, + sys.stdout, + default_flow_style=False, + allow_unicode=True + ) + print('---\n') + print(_json_to_text(summary)) + + if include_original: + print("\n---\n") + print("## Original Text\n") + print(content) + elif format == 'json': + json_summary = json.loads(summary) + json_summary["metadata"] = metadata + json_summary["model_used"] = result['model_used'] + json_summary["tokens_used"] = result['tokens_used'] + + if include_original: + json_summary["original_text"] = content + + print(json.dumps(json_summary, ensure_ascii=False, indent=2)) + else: + print(summary) def _generate_summary(content, summarizer, metadata, format, include_original=True) -> int: diff --git a/Projects/Nitrodigest/src/cli/requirements.txt b/Projects/Nitrodigest/src/cli/requirements.txt index adf5176..8023575 100644 --- a/Projects/Nitrodigest/src/cli/requirements.txt +++ b/Projects/Nitrodigest/src/cli/requirements.txt @@ -12,3 +12,6 @@ nltk>=3.9.1 # preprocessing utilities emoji>=2.14.1 + +# progress bar +tqdm>=4.67.1 From 7b520c3655684cda50567c46a1327d264ae56011 Mon Sep 17 00:00:00 2001 From: Marcin Kwiatkowski Date: Wed, 1 Oct 2025 07:41:39 +0200 Subject: [PATCH 5/5] chore: 
updated version --- Projects/Nitrodigest/setup.cfg | 2 +- Projects/Nitrodigest/src/cli/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Projects/Nitrodigest/setup.cfg b/Projects/Nitrodigest/setup.cfg index a471f29..fa64711 100644 --- a/Projects/Nitrodigest/setup.cfg +++ b/Projects/Nitrodigest/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = nitrodigest-cli -version = 0.2.0 +version = 0.3.0 author = Marcin Kwiatkowski author_email = marcin@frodigo.com description = The privacy‑first, local‑LLM text‑summariser for developers. diff --git a/Projects/Nitrodigest/src/cli/__init__.py b/Projects/Nitrodigest/src/cli/__init__.py index 76b1ae7..5043712 100644 --- a/Projects/Nitrodigest/src/cli/__init__.py +++ b/Projects/Nitrodigest/src/cli/__init__.py @@ -1,6 +1,6 @@ """nitrodigest CLI package""" -__version__ = "0.2.0" +__version__ = "0.3.0" from .main import main from .config import Config