diff --git a/Cargo.lock b/Cargo.lock index 3de43e685b..63e507e209 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8,6 +8,16 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" +[[package]] +name = "aead" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d122413f284cf2d62fb1b7db97e02edb8cda96d769b16e443a4f6195e35662b0" +dependencies = [ + "crypto-common", + "generic-array", +] + [[package]] name = "aes" version = "0.8.4" @@ -19,6 +29,20 @@ dependencies = [ "cpufeatures", ] +[[package]] +name = "aes-gcm" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "831010a0f742e1209b3bcea8fab6a8e149051ba6099432c8cb2cc117dec3ead1" +dependencies = [ + "aead", + "aes", + "cipher", + "ctr", + "ghash", + "subtle", +] + [[package]] name = "ahash" version = "0.7.8" @@ -1522,6 +1546,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" dependencies = [ "generic-array", + "rand_core 0.6.4", "typenum", ] @@ -1556,6 +1581,15 @@ dependencies = [ "syn 2.0.111", ] +[[package]] +name = "ctr" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0369ee1ad671834580515889b80f2ea915f23b8be8d0daa4bbaf2ac5c7590835" +dependencies = [ + "cipher", +] + [[package]] name = "darling" version = "0.20.11" @@ -2497,7 +2531,7 @@ dependencies = [ "libc", "option-ext", "redox_users", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -2639,7 +2673,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -2972,6 +3006,16 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = 
"ghash" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0d8a4362ccb29cb0b265253fb0a2728f592895ee6854fd9bc13f2ffda266ff1" +dependencies = [ + "opaque-debug", + "polyval", +] + [[package]] name = "glob" version = "0.3.3" @@ -3312,7 +3356,7 @@ dependencies = [ "libc", "percent-encoding", "pin-project-lite", - "socket2 0.5.10", + "socket2 0.6.1", "tokio", "tower-service", "tracing", @@ -3346,6 +3390,7 @@ dependencies = [ name = "iceberg" version = "0.8.0" dependencies = [ + "aes-gcm", "anyhow", "apache-avro 0.21.0", "array-init", @@ -3400,6 +3445,7 @@ dependencies = [ "typed-builder", "url", "uuid", + "zeroize", "zstd", ] @@ -3845,7 +3891,7 @@ dependencies = [ "portable-atomic", "portable-atomic-util", "serde_core", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -4366,7 +4412,7 @@ version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -4521,6 +4567,12 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" +[[package]] +name = "opaque-debug" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08d65885ee38876c4f86fa503fb49d7b507c2b62552df7c70b2fce627e06381" + [[package]] name = "opendal" version = "0.55.0" @@ -4887,6 +4939,18 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +[[package]] +name = "polyval" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d1fe60d06143b2430aa532c94cfe9e29783047f06c0d7fd359a9a51b729fa25" +dependencies = [ + "cfg-if", + "cpufeatures", + "opaque-debug", + "universal-hash", 
+] + [[package]] name = "port_scanner" version = "0.1.5" @@ -5166,7 +5230,7 @@ dependencies = [ "quinn-udp", "rustc-hash", "rustls 0.23.35", - "socket2 0.5.10", + "socket2 0.6.1", "thiserror 2.0.17", "tokio", "tracing", @@ -5203,9 +5267,9 @@ dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2 0.5.10", + "socket2 0.6.1", "tracing", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] @@ -5697,7 +5761,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -6699,7 +6763,7 @@ dependencies = [ "getrandom 0.3.4", "once_cell", "rustix", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -7215,6 +7279,16 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81e544489bf3d8ef66c953931f56617f423cd4b5494be343d9b9d3dda037b9a3" +[[package]] +name = "universal-hash" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc1de2c688dc15305988b563c3854064043356019f97a4b46276fe734c4f07ea" +dependencies = [ + "crypto-common", + "subtle", +] + [[package]] name = "unsafe-libyaml" version = "0.2.11" @@ -7514,7 +7588,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.48.0", + "windows-sys 0.61.2", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 517bfa36e8..1a91ba3bca 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,6 +39,7 @@ repository = "https://github.com/apache/iceberg-rust" rust-version = "1.88" [workspace.dependencies] +aes-gcm = "0.10" anyhow = "1.0.72" apache-avro = { version = "0.21", features = ["zstandard"] } array-init = "2" @@ -131,5 +132,6 @@ typed-builder = "0.20" url = "2.5.7" uuid = { version = "1.18", features = ["v7"] } volo = "0.10.6" +zeroize = "1.7" volo-thrift = "0.10.8" zstd = "0.13.3" \ No newline at end of file diff --git 
a/crates/iceberg/Cargo.toml b/crates/iceberg/Cargo.toml
index 6f1332a444..02daf4c860 100644
--- a/crates/iceberg/Cargo.toml
+++ b/crates/iceberg/Cargo.toml
@@ -41,6 +41,7 @@ storage-s3 = ["opendal/services-s3", "reqsign"]
 
 
 [dependencies]
+aes-gcm = { workspace = true }
 anyhow = { workspace = true }
 apache-avro = { workspace = true }
 array-init = { workspace = true }
@@ -88,6 +89,7 @@ tokio = { workspace = true, optional = false, features = ["sync"] }
 typed-builder = { workspace = true }
 url = { workspace = true }
 uuid = { workspace = true }
+zeroize = { workspace = true }
 zstd = { workspace = true }
 
 [dev-dependencies]
diff --git a/crates/iceberg/src/encryption/crypto.rs b/crates/iceberg/src/encryption/crypto.rs
new file mode 100644
index 0000000000..9480d4d984
--- /dev/null
+++ b/crates/iceberg/src/encryption/crypto.rs
@@ -0,0 +1,388 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Core cryptographic operations for Iceberg encryption.
+
+use std::str::FromStr;
+
+use aes_gcm::aead::{Aead, AeadCore, KeyInit, OsRng, Payload};
+use aes_gcm::{Aes128Gcm, Key, Nonce};
+use zeroize::Zeroizing;
+
+use crate::{Error, ErrorKind, Result};
+
+/// Supported encryption algorithm.
+/// Currently only AES-128-GCM is supported as it's the only algorithm
+/// compatible with arrow-rs Parquet encryption.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum EncryptionAlgorithm {
+    /// AES-128 in GCM mode
+    Aes128Gcm,
+}
+
+impl EncryptionAlgorithm {
+    /// Returns the key length in bytes for this algorithm.
+    pub fn key_length(&self) -> usize {
+        match self {
+            Self::Aes128Gcm => 16,
+        }
+    }
+
+    /// Returns the nonce/IV length in bytes for this algorithm.
+    pub fn nonce_length(&self) -> usize {
+        12 // GCM uses 96-bit nonces
+    }
+
+    /// Returns the string identifier for this algorithm.
+    pub fn as_str(&self) -> &'static str {
+        match self {
+            Self::Aes128Gcm => "AES_GCM_128",
+        }
+    }
+}
+
+impl FromStr for EncryptionAlgorithm {
+    type Err = Error;
+
+    fn from_str(s: &str) -> Result<Self> {
+        match s {
+            "AES_GCM_128" | "AES128_GCM" => Ok(Self::Aes128Gcm),
+            _ => Err(Error::new(
+                ErrorKind::DataInvalid,
+                format!("Unsupported encryption algorithm: {s}"),
+            )),
+        }
+    }
+}
+
+/// A secure encryption key that zeroes its memory on drop.
+///
+/// `Debug` is implemented by hand so that the key bytes are never printed.
+pub struct SecureKey {
+    key: Zeroizing<Vec<u8>>,
+    algorithm: EncryptionAlgorithm,
+}
+
+impl std::fmt::Debug for SecureKey {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Redact the key so it cannot leak through logs or panic messages.
+        f.debug_struct("SecureKey")
+            .field("key", &"[REDACTED]")
+            .field("algorithm", &self.algorithm)
+            .finish()
+    }
+}
+
+impl SecureKey {
+    /// Creates a new secure key with the specified algorithm.
+    ///
+    /// # Errors
+    /// Returns an error if the key length doesn't match the algorithm requirements.
+    pub fn new(key: Vec<u8>, algorithm: EncryptionAlgorithm) -> Result<Self> {
+        if key.len() != algorithm.key_length() {
+            return Err(Error::new(
+                ErrorKind::DataInvalid,
+                format!(
+                    "Invalid key length for {:?}: expected {} bytes, got {}",
+                    algorithm,
+                    algorithm.key_length(),
+                    key.len()
+                ),
+            ));
+        }
+        Ok(Self {
+            key: Zeroizing::new(key),
+            algorithm,
+        })
+    }
+
+    /// Generates a new random key for the specified algorithm.
+    pub fn generate(algorithm: EncryptionAlgorithm) -> Self {
+        let mut key = vec![0u8; algorithm.key_length()];
+        use rand::RngCore;
+        OsRng.fill_bytes(&mut key);
+        Self {
+            key: Zeroizing::new(key),
+            algorithm,
+        }
+    }
+
+    /// Returns the encryption algorithm for this key.
+    pub fn algorithm(&self) -> EncryptionAlgorithm {
+        self.algorithm
+    }
+
+    /// Returns the key bytes.
+    pub fn as_bytes(&self) -> &[u8] {
+        &self.key
+    }
+}
+
+/// AES-GCM encryptor for encrypting and decrypting data.
+pub struct AesGcmEncryptor {
+    key: SecureKey,
+}
+
+impl AesGcmEncryptor {
+    /// Creates a new encryptor with the specified key.
+    pub fn new(key: SecureKey) -> Self {
+        Self { key }
+    }
+
+    /// Encrypts data using AES-GCM.
+    ///
+    /// # Arguments
+    /// * `plaintext` - The data to encrypt
+    /// * `aad` - Additional authenticated data (optional)
+    ///
+    /// # Returns
+    /// The encrypted data in the format: [12-byte nonce][ciphertext][16-byte auth tag]
+    /// This matches the Java implementation format for compatibility.
+    pub fn encrypt(&self, plaintext: &[u8], aad: Option<&[u8]>) -> Result<Vec<u8>> {
+        match self.key.algorithm() {
+            EncryptionAlgorithm::Aes128Gcm => self.encrypt_aes128_gcm(plaintext, aad),
+        }
+    }
+
+    /// Decrypts data using AES-GCM.
+    ///
+    /// # Arguments
+    /// * `ciphertext` - The encrypted data with format: [12-byte nonce][encrypted data][16-byte auth tag]
+    /// * `aad` - Additional authenticated data (must match encryption)
+    ///
+    /// # Returns
+    /// The decrypted plaintext.
+    pub fn decrypt(&self, ciphertext: &[u8], aad: Option<&[u8]>) -> Result<Vec<u8>> {
+        const NONCE_LEN: usize = 12;
+        const TAG_LEN: usize = 16;
+
+        if ciphertext.len() < NONCE_LEN + TAG_LEN {
+            return Err(Error::new(
+                ErrorKind::DataInvalid,
+                format!(
+                    "Ciphertext too short: expected at least {} bytes, got {}",
+                    NONCE_LEN + TAG_LEN,
+                    ciphertext.len()
+                ),
+            ));
+        }
+
+        let nonce = &ciphertext[..NONCE_LEN];
+        let encrypted_data = &ciphertext[NONCE_LEN..];
+        match self.key.algorithm() {
+            EncryptionAlgorithm::Aes128Gcm => self.decrypt_aes128_gcm(nonce, encrypted_data, aad),
+        }
+    }
+
+    fn encrypt_aes128_gcm(&self, plaintext: &[u8], aad: Option<&[u8]>) -> Result<Vec<u8>> {
+        let key = Key::<Aes128Gcm>::from_slice(self.key.as_bytes());
+        let cipher = Aes128Gcm::new(key);
+        let nonce = Aes128Gcm::generate_nonce(&mut OsRng);
+
+        // A missing AAD is passed as an empty slice; `Payload::from(&[u8])` does the same.
+        let payload = Payload {
+            msg: plaintext,
+            aad: aad.unwrap_or_default(),
+        };
+        let ciphertext = cipher.encrypt(&nonce, payload).map_err(|e| {
+            Error::new(ErrorKind::Unexpected, "AES-128-GCM encryption failed")
+                .with_source(anyhow::anyhow!(e))
+        })?;
+
+        // Prepend nonce to ciphertext (Java compatible format)
+        let mut result = Vec::with_capacity(nonce.len() + ciphertext.len());
+        result.extend_from_slice(&nonce);
+        result.extend_from_slice(&ciphertext);
+        Ok(result)
+    }
+
+    fn decrypt_aes128_gcm(
+        &self,
+        nonce: &[u8],
+        ciphertext: &[u8],
+        aad: Option<&[u8]>,
+    ) -> Result<Vec<u8>> {
+        let key = Key::<Aes128Gcm>::from_slice(self.key.as_bytes());
+        let cipher = Aes128Gcm::new(key);
+        let nonce = Nonce::from_slice(nonce);
+
+        // A missing AAD is passed as an empty slice, mirroring the encryption path.
+        let payload = Payload {
+            msg: ciphertext,
+            aad: aad.unwrap_or_default(),
+        };
+        cipher.decrypt(nonce, payload).map_err(|e| {
+            Error::new(ErrorKind::Unexpected, "AES-128-GCM decryption failed")
+                .with_source(anyhow::anyhow!(e))
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_encryption_algorithm() {
+        assert_eq!(EncryptionAlgorithm::Aes128Gcm.key_length(), 16);
+        assert_eq!(EncryptionAlgorithm::Aes128Gcm.nonce_length(), 12);
+
+        assert_eq!(
+            EncryptionAlgorithm::from_str("AES_GCM_128").unwrap(),
+            EncryptionAlgorithm::Aes128Gcm
+        );
+        assert_eq!(
+            EncryptionAlgorithm::from_str("AES128_GCM").unwrap(),
+            EncryptionAlgorithm::Aes128Gcm
+        );
+
+        assert!(EncryptionAlgorithm::from_str("INVALID").is_err());
+        assert!(EncryptionAlgorithm::from_str("AES_GCM_256").is_err());
+        assert!(EncryptionAlgorithm::from_str("AES256_GCM").is_err());
+
+        assert_eq!(EncryptionAlgorithm::Aes128Gcm.as_str(), "AES_GCM_128");
+    }
+
+    #[test]
+    fn test_secure_key() {
+        // Test key generation
+        let key1 = SecureKey::generate(EncryptionAlgorithm::Aes128Gcm);
+        assert_eq!(key1.as_bytes().len(), 16);
+        assert_eq!(key1.algorithm(), EncryptionAlgorithm::Aes128Gcm);
+
+        // Test key creation with validation
+        let valid_key = vec![0u8; 16];
+        assert!(SecureKey::new(valid_key, EncryptionAlgorithm::Aes128Gcm).is_ok());
+
+        let invalid_key = vec![0u8; 32];
+        assert!(SecureKey::new(invalid_key, EncryptionAlgorithm::Aes128Gcm).is_err());
+    }
+
+    #[test]
+    fn test_secure_key_debug_redacts_key_material() {
+        let key = SecureKey::new(vec![0xAB; 16], EncryptionAlgorithm::Aes128Gcm).unwrap();
+        let rendered = format!("{key:?}");
+
+        assert!(rendered.contains("[REDACTED]"));
+        // 0xAB would show up as the decimal byte 171 if the key were printed.
+        assert!(!rendered.contains("171"));
+    }
+
+    #[test]
+    fn test_aes128_gcm_encryption_roundtrip() {
+        let key = SecureKey::generate(EncryptionAlgorithm::Aes128Gcm);
+        let encryptor = AesGcmEncryptor::new(key);
+
+        let plaintext = b"Hello, Iceberg encryption!";
+        let aad = b"additional authenticated data";
+
+        // Test without AAD
+        let ciphertext = encryptor.encrypt(plaintext, None).unwrap();
+        assert!(ciphertext.len() > plaintext.len() + 12); // nonce + tag
+        assert_ne!(&ciphertext[12..], plaintext); // encrypted portion differs
+
+        let decrypted = encryptor.decrypt(&ciphertext, None).unwrap();
+        assert_eq!(decrypted, plaintext);
+
+        // Test with AAD
+        let ciphertext = encryptor.encrypt(plaintext, Some(aad)).unwrap();
+        let decrypted = encryptor.decrypt(&ciphertext, Some(aad)).unwrap();
+        assert_eq!(decrypted, plaintext);
+
+        // Test with wrong AAD fails
+        assert!(encryptor.decrypt(&ciphertext, Some(b"wrong aad")).is_err());
+    }
+
+    #[test]
+    fn test_encryption_with_empty_plaintext() {
+        let key = SecureKey::generate(EncryptionAlgorithm::Aes128Gcm);
+        let encryptor = AesGcmEncryptor::new(key);
+
+        let plaintext = b"";
+        let ciphertext = encryptor.encrypt(plaintext, None).unwrap();
+
+        // Even empty plaintext produces nonce + tag
+        assert_eq!(ciphertext.len(), 12 + 16); // 12-byte nonce + 16-byte tag
+
+        let decrypted = encryptor.decrypt(&ciphertext, None).unwrap();
+        assert_eq!(decrypted, plaintext);
+    }
+
+    #[test]
+    fn test_decryption_with_tampered_ciphertext() {
+        let key = SecureKey::generate(EncryptionAlgorithm::Aes128Gcm);
+        let encryptor = AesGcmEncryptor::new(key);
+
+        let plaintext = b"Sensitive data";
+        let mut ciphertext = encryptor.encrypt(plaintext, None).unwrap();
+
+        // Tamper with the encrypted portion (after the nonce)
+        if ciphertext.len() > 12 {
+            ciphertext[12] ^= 0xFF;
+        }
+
+        // Decryption should fail due to authentication tag mismatch
+        assert!(encryptor.decrypt(&ciphertext, None).is_err());
+    }
+
+    #[test]
+    fn test_different_keys_produce_different_ciphertexts() {
+        let key1 = SecureKey::generate(EncryptionAlgorithm::Aes128Gcm);
+        let key2 = SecureKey::generate(EncryptionAlgorithm::Aes128Gcm);
+
+        let encryptor1 = AesGcmEncryptor::new(key1);
+        let encryptor2 = AesGcmEncryptor::new(key2);
+
+        let plaintext = b"Same plaintext";
+
+        let ciphertext1 = encryptor1.encrypt(plaintext, None).unwrap();
+        let ciphertext2 = encryptor2.encrypt(plaintext, None).unwrap();
+
+        // Different keys should produce different ciphertexts (comparing the encrypted portion)
+        // Note: The nonces will also be different, but we're mainly interested in the encrypted data
+        assert_ne!(&ciphertext1[12..], &ciphertext2[12..]);
+    }
+
+    #[test]
+    fn test_ciphertext_format_java_compatible() {
+        // Test that our ciphertext format matches Java's: [12-byte nonce][ciphertext][16-byte tag]
+        let key = SecureKey::generate(EncryptionAlgorithm::Aes128Gcm);
+        let encryptor = AesGcmEncryptor::new(key);
+
+        let plaintext = b"Test data";
+        let ciphertext = encryptor.encrypt(plaintext, None).unwrap();
+
+        // Format should be: [12-byte nonce][encrypted_data + 16-byte GCM tag]
+        assert_eq!(
+            ciphertext.len(),
+            12 + plaintext.len() + 16,
+            "Ciphertext should be nonce + plaintext + tag length"
+        );
+
+        // Verify we can decrypt by extracting nonce from the beginning
+        let nonce = &ciphertext[..12];
+        assert_eq!(nonce.len(), 12, "Nonce should be 12 bytes");
+
+        // The rest is encrypted data + tag
+        let encrypted_with_tag = &ciphertext[12..];
+        assert_eq!(
+            encrypted_with_tag.len(),
+            plaintext.len() + 16,
+            "Encrypted portion should be plaintext length + 16-byte tag"
+        );
+    }
+}
diff --git a/crates/iceberg/src/encryption/mod.rs b/crates/iceberg/src/encryption/mod.rs
new file mode 100644
index 0000000000..496209d591
--- /dev/null
+++ b/crates/iceberg/src/encryption/mod.rs
@@ -0,0 +1,25 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Encryption module for Apache Iceberg.
+//!
+//! This module provides core cryptographic primitives for encrypting
+//! and decrypting data in Iceberg tables.
+
+mod crypto;
+
+pub use crypto::{AesGcmEncryptor, EncryptionAlgorithm, SecureKey};
diff --git a/crates/iceberg/src/lib.rs b/crates/iceberg/src/lib.rs
index 8d8f40f72d..4a36390a48 100644
--- a/crates/iceberg/src/lib.rs
+++ b/crates/iceberg/src/lib.rs
@@ -91,6 +91,7 @@ mod runtime;
 
 pub mod arrow;
 pub(crate) mod delete_file_index;
+pub mod encryption;
 pub mod test_utils;
 mod utils;
 pub mod writer;