diff --git a/src/web/format_detection.rs b/src/web/format_detection.rs index b6b7072..6aaa07a 100644 --- a/src/web/format_detection.rs +++ b/src/web/format_detection.rs @@ -444,6 +444,40 @@ pub fn parse_binary_file( } } +/// Parse a binary file directly from a file path (for streamed uploads). +/// +/// Unlike [`parse_binary_file`], this function does not need to create a temporary file — +/// the caller already has one. The file may be truncated after the header; only the +/// header portion is needed for BAM/CRAM. +/// +/// # Errors +/// +/// Returns `ParseError::ParseFailed` if parsing fails, or if the format is not a +/// supported binary format. +pub fn parse_binary_file_from_path( + path: &std::path::Path, + format: FileFormat, +) -> Result { + match format { + FileFormat::Bam | FileFormat::Cram => { + crate::parsing::sam::parse_file(path).map_err(|e| ParseError::ParseFailed { + format, + message: format!("Binary file parsing failed: {e}"), + }) + } + FileFormat::Fasta => { + crate::parsing::fasta::parse_fasta_file(path).map_err(|e| ParseError::ParseFailed { + format, + message: format!("FASTA file parsing failed: {e}"), + }) + } + _ => Err(ParseError::ParseFailed { + format, + message: "Format is not a binary file format".to_string(), + }), + } +} + #[cfg(test)] mod tests { use super::*; @@ -616,4 +650,77 @@ mod tests { assert_eq!(query.contigs[0].name, "chr1"); assert_eq!(query.contigs[0].length, 248_956_422); } + + /// Helper: build a minimal BAM byte buffer from a SAM header string. 
+ fn build_bam_bytes(header_text: &str) -> Vec { + use noodles::bam; + use noodles::sam; + + let mut reader = sam::io::Reader::new(header_text.as_bytes()); + let header = reader.read_header().unwrap(); + + let mut buf = Vec::new(); + { + let mut writer = bam::io::Writer::new(&mut buf); + writer.write_header(&header).unwrap(); + } + buf + } + + #[test] + fn test_parse_binary_file_from_path_bam() { + use std::io::Write; + use tempfile::NamedTempFile; + + let bam_bytes = build_bam_bytes( + "@HD\tVN:1.6\n@SQ\tSN:chr1\tLN:248956422\n@SQ\tSN:chr2\tLN:242193529\n", + ); + + let mut temp = NamedTempFile::with_suffix(".bam").unwrap(); + temp.write_all(&bam_bytes).unwrap(); + + let result = parse_binary_file_from_path(temp.path(), FileFormat::Bam); + assert!(result.is_ok()); + let query = result.unwrap(); + assert_eq!(query.contigs.len(), 2); + assert_eq!(query.contigs[0].name, "chr1"); + assert_eq!(query.contigs[0].length, 248_956_422); + assert_eq!(query.contigs[1].name, "chr2"); + assert_eq!(query.contigs[1].length, 242_193_529); + } + + #[test] + fn test_parse_binary_file_from_path_truncated_bam() { + // Verify that parsing works on a BAM file truncated after the header. + // This simulates the server-side streaming behavior where only the first + // N bytes of a large BAM are written to a temp file. 
+ use std::io::Write; + use tempfile::NamedTempFile; + + let mut bam_bytes = build_bam_bytes("@HD\tVN:1.6\n@SQ\tSN:chr1\tLN:248956422\n"); + + // Append junk data to simulate a truncated file (records cut off mid-stream) + bam_bytes.extend_from_slice(&[0u8; 1024]); + + let mut temp = NamedTempFile::with_suffix(".bam").unwrap(); + temp.write_all(&bam_bytes).unwrap(); + + let result = parse_binary_file_from_path(temp.path(), FileFormat::Bam); + assert!(result.is_ok()); + let query = result.unwrap(); + assert_eq!(query.contigs.len(), 1); + assert_eq!(query.contigs[0].name, "chr1"); + } + + #[test] + fn test_parse_binary_file_from_path_unsupported_format() { + use std::io::Write; + use tempfile::NamedTempFile; + + let mut temp = NamedTempFile::with_suffix(".txt").unwrap(); + temp.write_all(b"not a binary file").unwrap(); + + let result = parse_binary_file_from_path(temp.path(), FileFormat::Sam); + assert!(result.is_err()); + } } diff --git a/src/web/server.rs b/src/web/server.rs index 9d1e934..6af85da 100644 --- a/src/web/server.rs +++ b/src/web/server.rs @@ -24,18 +24,22 @@ use crate::matching::engine::{MatchingConfig, MatchingEngine, ScoringWeights}; use crate::matching::Suggestion; use crate::utils::validation::{validate_upload, ValidationError}; use crate::web::format_detection::{ - detect_format, parse_binary_file, parse_with_format, FileFormat, + detect_format, parse_binary_file, parse_binary_file_from_path, parse_with_format, FileFormat, }; /// Security configuration constants to prevent `DoS` attacks pub const MAX_MULTIPART_FIELDS: usize = 10; -pub const MAX_FILE_FIELD_SIZE: usize = 16 * 1024 * 1024; // 16MB +pub const MAX_FILE_FIELD_SIZE: usize = 16 * 1024 * 1024; // 16MB for text files pub const MAX_TEXT_FIELD_SIZE: usize = 1024 * 1024; // 1MB -/// Maximum bytes to read from a binary file upload for header extraction. -/// BAM/CRAM headers are typically under 500KB; 4MB provides generous headroom -/// for reference genomes with many contigs (e.g. 
>100,000 alt contigs). -pub const MAX_BINARY_HEADER_SIZE: usize = 4 * 1024 * 1024; +/// Maximum bytes to read from a binary upload before attempting header parse. +/// BAM/CRAM headers are typically < 1 MB; 64 MB provides generous headroom. +pub const BINARY_HEADER_READ_LIMIT: usize = 64 * 1024 * 1024; // 64MB + +/// Axum body limit — raised to allow large binary uploads to stream through. +/// This does not cause memory bloat because `field.chunk()` reads lazily from +/// the underlying HTTP body stream. +const MAX_BODY_SIZE: usize = 256 * 1024 * 1024; // 256MB /// Helper function to convert usize count to f64 with explicit precision loss allowance #[inline] @@ -52,13 +56,22 @@ pub struct AppState { pub refget_config: Option, } +/// Binary content from an upload, either fully buffered or streamed to a temp file. +#[derive(Debug)] +enum BinaryContent { + /// Small binary file fully buffered in memory (e.g. from a non-binary-format upload) + InMemory(Vec), + /// Large binary file streamed to a temp file (BAM/CRAM uploads) + TempFile(tempfile::NamedTempFile), +} + /// Input data extracted from multipart form #[derive(Debug)] struct InputData { /// Text content (if provided via textarea or text file) text_content: Option, /// Binary file content (if provided) - binary_content: Option>, + binary_content: Option, /// Original filename filename: Option, /// Detected or specified format @@ -236,8 +249,9 @@ pub fn create_router(refget_config: Option) -> anyh )) // Limit concurrent requests to prevent DOS .layer(ConcurrencyLimitLayer::new(100)) - // Limit request body size (accommodate largest file + multipart overhead) - .layer(DefaultBodyLimit::max(20 * 1024 * 1024)), // 20MB limit + // Limit request body size — raised for binary streaming; actual read + // limits are enforced per-field in extract_request_data() + .layer(DefaultBodyLimit::max(MAX_BODY_SIZE)), ); Ok(app) @@ -845,111 +859,117 @@ async fn extract_request_data( "file" => { let filename = 
field.file_name().map(std::string::ToString::to_string); - match field.bytes().await { - Ok(bytes) => { - // Detect format from filename first to pick the right size limit - let detected_format = if let Some(ref name) = filename { - detect_binary_format(name).unwrap_or(FileFormat::Auto) - } else { - FileFormat::Auto - }; - - // Use stricter limit for binary files (only headers needed). - // FASTA is intentionally excluded: unlike BAM/CRAM where we - // only need the header, FASTA files require full sequences - // to compute contig lengths. - let max_size = if matches!( - detected_format, - FileFormat::Bam | FileFormat::Cram - ) { - MAX_BINARY_HEADER_SIZE - } else { - MAX_FILE_FIELD_SIZE - }; - - // Validate field size before processing - if bytes.len() > max_size { - return Err(( - StatusCode::PAYLOAD_TOO_LARGE, - Json(ErrorResponse { - error: "File size exceeds limit".to_string(), - error_type: ErrorType::FileTooLarge, - details: None, - }), - ) - .into_response()); + // Detect format from filename before reading the body + let detected_format = if let Some(ref name) = filename { + detect_binary_format(name).unwrap_or(FileFormat::Auto) + } else { + FileFormat::Auto + }; + + // For BAM/CRAM: stream chunks to a temp file (header-only read) + if matches!(detected_format, FileFormat::Bam | FileFormat::Cram) { + match read_binary_chunks(field, detected_format).await { + Ok((temp_file, _bytes_read)) => { + input_data.filename = filename; + input_data.binary_content = + Some(BinaryContent::TempFile(temp_file)); + input_data.format = Some(detected_format); } - - // Use comprehensive validation function for security - match validate_upload(filename.as_deref(), &bytes, detected_format) - { - Ok(validated_filename) => { - input_data.filename = validated_filename; - - // Detect if content is binary or text - if is_binary_content(&bytes) { - input_data.binary_content = Some(bytes.to_vec()); - input_data.format = Some(detected_format); - } else { - input_data.text_content = - 
Some(String::from_utf8_lossy(&bytes).to_string()); - } - } - Err(ValidationError::FilenameTooLong) => { - return Err(( - StatusCode::BAD_REQUEST, - Json(create_safe_error_response( - ErrorType::FilenameTooLong, - "Filename exceeds maximum length limit", - Some("Filename validation failed due to length constraints") - )), - ).into_response()); - } - Err(ValidationError::InvalidFilename) => { - return Err(( - StatusCode::BAD_REQUEST, - Json(create_safe_error_response( - ErrorType::InvalidFilename, - "Filename contains invalid or dangerous characters", - Some("Filename validation failed due to invalid characters") - )), - ).into_response()); - } - Err(ValidationError::FormatValidationFailed) => { - return Err(( - StatusCode::BAD_REQUEST, - Json(create_safe_error_response( - ErrorType::FormatMismatch, - "File content does not match the expected format based on filename", - Some("Format validation failed") - )), - ).into_response()); - } - Err(ValidationError::InvalidFileContent) => { + Err(err_response) => return Err(err_response), + } + } else { + // Text and other formats: buffer fully in memory + match field.bytes().await { + Ok(bytes) => { + // Validate field size before processing + if bytes.len() > MAX_FILE_FIELD_SIZE { return Err(( - StatusCode::BAD_REQUEST, - Json(create_safe_error_response( - ErrorType::InvalidContent, - "File content appears malformed or corrupted", - None, - )), + StatusCode::PAYLOAD_TOO_LARGE, + Json(ErrorResponse { + error: "File size exceeds limit".to_string(), + error_type: ErrorType::FileTooLarge, + details: None, + }), ) .into_response()); } - Err(_) => { - return Err(( - StatusCode::BAD_REQUEST, - Json(create_safe_error_response( - ErrorType::ValidationFailed, - "File validation failed", - None, - )), - ) - .into_response()); + + // Use comprehensive validation function for security + match validate_upload( + filename.as_deref(), + &bytes, + detected_format, + ) { + Ok(validated_filename) => { + input_data.filename = 
validated_filename; + + // Detect if content is binary or text + if is_binary_content(&bytes) { + input_data.binary_content = + Some(BinaryContent::InMemory(bytes.to_vec())); + input_data.format = Some(detected_format); + } else { + input_data.text_content = Some( + String::from_utf8_lossy(&bytes).to_string(), + ); + } + } + Err(ValidationError::FilenameTooLong) => { + return Err(( + StatusCode::BAD_REQUEST, + Json(create_safe_error_response( + ErrorType::FilenameTooLong, + "Filename exceeds maximum length limit", + Some("Filename validation failed due to length constraints") + )), + ).into_response()); + } + Err(ValidationError::InvalidFilename) => { + return Err(( + StatusCode::BAD_REQUEST, + Json(create_safe_error_response( + ErrorType::InvalidFilename, + "Filename contains invalid or dangerous characters", + Some("Filename validation failed due to invalid characters") + )), + ).into_response()); + } + Err(ValidationError::FormatValidationFailed) => { + return Err(( + StatusCode::BAD_REQUEST, + Json(create_safe_error_response( + ErrorType::FormatMismatch, + "File content does not match the expected format based on filename", + Some("Format validation failed") + )), + ).into_response()); + } + Err(ValidationError::InvalidFileContent) => { + return Err(( + StatusCode::BAD_REQUEST, + Json(create_safe_error_response( + ErrorType::InvalidContent, + "File content appears malformed or corrupted", + None, + )), + ) + .into_response()); + } + Err(_) => { + return Err(( + StatusCode::BAD_REQUEST, + Json(create_safe_error_response( + ErrorType::ValidationFailed, + "File validation failed", + None, + )), + ) + .into_response()); + } } } + Err(_) => had_parse_error = true, } - Err(_) => had_parse_error = true, } } "header_text" => match field.text().await { @@ -1083,7 +1103,12 @@ fn parse_input_data( // Binary file parsing let format = input_data.format.unwrap_or(FileFormat::Bam); - match parse_binary_file(binary_content, format) { + let result = match binary_content { + 
BinaryContent::InMemory(bytes) => parse_binary_file(bytes, format), + BinaryContent::TempFile(temp) => parse_binary_file_from_path(temp.path(), format), + }; + + match result { Ok(query) => Ok((query, Vec::new())), Err(_) => Err(Box::new(( StatusCode::BAD_REQUEST, @@ -1110,6 +1135,94 @@ fn parse_input_data( } } +/// Read a binary file upload in chunks, writing to a temp file. +/// +/// Stops after [`BINARY_HEADER_READ_LIMIT`] bytes — enough for any BAM/CRAM header. +/// Returns the temp file (kept alive for parsing) and total bytes written. +async fn read_binary_chunks( + mut field: axum::extract::multipart::Field<'_>, + format: FileFormat, +) -> Result<(tempfile::NamedTempFile, usize), Response> { + use std::io::Write; + + let extension = match format { + FileFormat::Bam => ".bam", + FileFormat::Cram => ".cram", + _ => ".bin", + }; + + let mut temp_file = tempfile::NamedTempFile::with_suffix(extension).map_err(|e| { + tracing::error!("Failed to create temp file for binary upload: {e}"); + ( + StatusCode::INTERNAL_SERVER_ERROR, + Json(ErrorResponse { + error: "Internal error processing upload".to_string(), + error_type: ErrorType::InternalError, + details: None, + }), + ) + .into_response() + })?; + + let mut bytes_written: usize = 0; + + loop { + match field.chunk().await { + Ok(Some(chunk)) => { + let remaining = BINARY_HEADER_READ_LIMIT.saturating_sub(bytes_written); + if remaining == 0 { + break; + } + + let to_write = chunk.len().min(remaining); + temp_file.write_all(&chunk[..to_write]).map_err(|e| { + tracing::error!("Failed to write binary upload to temp file: {e}"); + ( + StatusCode::INTERNAL_SERVER_ERROR, + Json(ErrorResponse { + error: "Internal error processing upload".to_string(), + error_type: ErrorType::InternalError, + details: None, + }), + ) + .into_response() + })?; + bytes_written += to_write; + + if to_write < chunk.len() { + break; // Hit the limit mid-chunk + } + } + Ok(None) => break, // End of field + Err(_) => { + return Err(( + 
StatusCode::BAD_REQUEST, + Json(create_safe_error_response( + ErrorType::InvalidContent, + "Failed to read uploaded file", + Some("Error reading multipart chunk during binary upload"), + )), + ) + .into_response()); + } + } + } + + if bytes_written == 0 { + return Err(( + StatusCode::BAD_REQUEST, + Json(create_safe_error_response( + ErrorType::MissingInput, + "Uploaded file is empty", + None, + )), + ) + .into_response()); + } + + Ok((temp_file, bytes_written)) +} + /// Check if content appears to be binary fn is_binary_content(bytes: &[u8]) -> bool { // Simple heuristic: if more than 1% of first 1024 bytes are non-printable, consider binary diff --git a/src/web/static/js/main.js b/src/web/static/js/main.js index 761b3fc..a0a08a7 100644 --- a/src/web/static/js/main.js +++ b/src/web/static/js/main.js @@ -14,7 +14,6 @@ import { validateFileSize, formatFileSize, clamp, - MAX_FILE_SIZE, MAX_TEXT_FILE_SIZE } from './utils/helpers.js'; import { extractBamHeader, isBamFile } from './utils/headerExtractor.js'; @@ -255,18 +254,19 @@ async function handleFileUpload(input, format) { } catch (err) { console.warn('Client-side BAM header extraction failed, falling back to upload:', err); hideExtractionStatus(); - // Fall through to normal upload + // Fall through to server-side streaming upload } } - // Validate file size based on format - const maxSize = format === 'binary' ? MAX_FILE_SIZE : MAX_TEXT_FILE_SIZE; - const validation = validateFileSize(file, maxSize); - - if (!validation.valid) { - showUploadError(validation.error); - input.value = ''; // Clear the file input - return; + // Validate file size for text formats only; binary uploads are streamed + // server-side (only the header is read), so no client-side limit is needed. 
+ if (format !== 'binary') { + const validation = validateFileSize(file, MAX_TEXT_FILE_SIZE); + if (!validation.valid) { + showUploadError(validation.error); + input.value = ''; // Clear the file input + return; + } } tabManager.currentFile = file; diff --git a/src/web/static/js/utils/headerExtractor.js b/src/web/static/js/utils/headerExtractor.js index c79850b..5b90f2d 100644 --- a/src/web/static/js/utils/headerExtractor.js +++ b/src/web/static/js/utils/headerExtractor.js @@ -6,7 +6,7 @@ * @module utils/headerExtractor */ -import { MAX_FILE_SIZE } from './helpers.js'; +import { MAX_BAM_HEADER_READ } from './helpers.js'; /** * Concatenate an array of Uint8Array chunks into a single Uint8Array. @@ -137,7 +137,7 @@ async function decompressBgzfBlocks(bytes, neededBytes = 0) { * @throws {Error} If the file is not a valid BAM or extraction fails */ export async function extractBamHeader(file) { - const readSize = Math.min(file.size, MAX_FILE_SIZE); + const readSize = Math.min(file.size, MAX_BAM_HEADER_READ); const buffer = await readFileSlice(file, 0, readSize); const bytes = new Uint8Array(buffer); diff --git a/src/web/static/js/utils/helpers.js b/src/web/static/js/utils/helpers.js index 82557b5..1317d4c 100644 --- a/src/web/static/js/utils/helpers.js +++ b/src/web/static/js/utils/helpers.js @@ -101,16 +101,18 @@ MT\tassembled-molecule\tMT\tMitochondrion\tJ01415.2\t=\tNC_012920.1\tnon-nuclear }; /** - * Maximum file size for uploads (4MB) + * Maximum bytes to read for client-side BAM header extraction (64MB). + * BAM headers are typically well under 1MB of compressed BGZF blocks, + * so 64MB is more than sufficient for even the largest headers. 
* @constant {number} */ -export const MAX_FILE_SIZE = 4 * 1024 * 1024; // 4MB - only headers needed for BAM/CRAM +export const MAX_BAM_HEADER_READ = 64 * 1024 * 1024; /** - * Maximum file size for text files (1MB) + * Maximum file size for text file uploads (16MB) * @constant {number} */ -export const MAX_TEXT_FILE_SIZE = 1024 * 1024; +export const MAX_TEXT_FILE_SIZE = 16 * 1024 * 1024; /** * Debounce delay in milliseconds diff --git a/src/web/templates/index.html b/src/web/templates/index.html index 5245a98..5e5e706 100644 --- a/src/web/templates/index.html +++ b/src/web/templates/index.html @@ -147,7 +147,8 @@

Input & Configuration

BAM: Headers are extracted in your browser — large files are fine.
- CRAM/FASTA: Upload limit is 4MB (headers only). + CRAM: Only the header is read from the upload stream — large files are supported.
+ FASTA: The full file must be uploaded to compute contig lengths, so FASTA uploads (like text files) are limited to 16 MB.
@@ -155,7 +156,7 @@

Input & Configuration