ekmpa · hussien · Dec 11, 2025 · Dec 29, 2025 · Jan 14, 2026 · Jan 14, 2026
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,6 @@
 #Ours
 data/
+plots/
 results/
 saved_graphs/
 spark-warehouse/
@@ -93,6 +94,10 @@ docs/_build/
 # PyBuilder
 .pybuilder/
 target/
+# parquet files
+*.parquet
+# log files
+*.log*
 
 # Jupyter Notebook
 .ipynb_checkpoints
@@ -191,3 +196,8 @@ cython_debug/
 
 # PyPI configuration file
 .pypirc
+bash_scripts/jdk-25_linux-x64_bin.tar
+bash_scripts/jdk-17.0.12_linux-x64_bin.tar.gz
+bash_scripts/jdk-25_linux-x64_bin.tar.gz
+uv.lock
+tgrag/distribution/kl_divergiance.py
diff --git a/bash_scripts/CC-Crawls/Feb2025.txt b/bash_scripts/CC-Crawls/Feb2025.txt
@@ -0,0 +1 @@
+CC-MAIN-2025-08
diff --git a/bash_scripts/CC-Crawls/Jan2025.txt b/bash_scripts/CC-Crawls/Jan2025.txt
@@ -0,0 +1 @@
+CC-MAIN-2025-05
diff --git a/bash_scripts/CC-Crawls/Mar2025.txt b/bash_scripts/CC-Crawls/Mar2025.txt
@@ -0,0 +1 @@
+CC-MAIN-2025-13
diff --git a/bash_scripts/end-to-end-url.sh b/bash_scripts/end-to-end-url.sh
@@ -88,4 +88,4 @@ for data_type in  "${cc_file_types[@]}" ; do
       fi
   echo "********************** End Of $data_type Task **********************"
   done
-done < "$CRAWL_LIST_FILE"
+done < "$CRAWL_LIST_FILE"
diff --git a/bash_scripts/end-to-end.sh b/bash_scripts/end-to-end.sh
@@ -40,10 +40,18 @@ else
       spark_table_name=$6
 fi
 
+
+# Optional seventh argument: the listing value handed on to get_data.sh as
+# its fifth argument. A missing or empty argument becomes 0, which
+# get_data.sh reports as "listing is not provided" and answers by falling
+# back to the crawl's own <data_type>.paths.gz listing.
+listing=${7:-0}
+
 echo "cc_file_types= ${cc_file_types[@]}"
 echo "start_idx=$start_idx end_idx=$end_idx"
 echo "seed_list=$seed_list"
 echo "spark_table_name=$spark_table_name"
+echo "listing path=$listing"
 
 CRAWL_ARG="$1"
 
@@ -68,10 +76,12 @@ for data_type in  "${cc_file_types[@]}" ; do
       echo "Processing $CRAWL..."
       echo "Removing previous $CRAWL spark-warehouse"
       rm -rf "$PROJECT_ROOT/bash_scripts/spark-warehouse/$CRAWL"
+      rm -rf "$PROJECT_ROOT/bash_scripts/spark-warehouse/warc_index_table_$CRAWL"
       echo $CRAWL
+      CRAWL_LowerCase=${CRAWL,,}
       echo "################################### run get data @ $(date '+%Y-%m-%d %H:%M:%S') ###################################"
-      echo "$SCRIPT_DIR/get_data.sh" "$CRAWL" $start_idx $end_idx "[$data_type]"
-      "$SCRIPT_DIR/get_data.sh" "$CRAWL" $start_idx $end_idx "[$data_type]"
+      echo "$SCRIPT_DIR/get_data.sh" "$CRAWL" $start_idx $end_idx "[$data_type]" $listing
+      "$SCRIPT_DIR/get_data.sh" "$CRAWL" $start_idx $end_idx "[$data_type]" $listing
       echo "Data Downloaded for $CRAWL."
       if [ "$data_type" = "wat" ]; then
         echo "################ Start Processing Processing $data_type Files ######################"
@@ -84,14 +94,24 @@ for data_type in  "${cc_file_types[@]}" ; do
       elif [ "$data_type" = "wet" ]; then
          echo "#####################  run_wet_content_extraction @ $(date '+%Y-%m-%d %H:%M:%S') #####################"
          if [ -z "$SCRATCH" ]; then
-            rm -rf "$PROJECT_ROOT/bash_scripts/spark-warehouse/wet_${spark_table_name}_${CRAWL//-/}_${start_idx}_${end_idx}/" # Remove re-created directories before running
+            rm -rf "$PROJECT_ROOT/bash_scripts/spark-warehouse/wet_${spark_table_name}_${CRAWL_LowerCase//-/}_${start_idx}_${end_idx}/" # Remove re-created directories before running
          else
-            rm -rf "$SCRATCH/spark-warehouse/wet_${spark_table_name}_${CRAWL//-/}_${start_idx}_${end_idx}/" # Remove re-created directories before running
+            rm -rf "$SCRATCH/spark-warehouse/wet_${spark_table_name}_${CRAWL_LowerCase//-/}_${start_idx}_${end_idx}/" # Remove re-created directories before running
          fi
-         echo "$SCRIPT_DIR/run_extract_wet_content.sh" "$CRAWL" "wet_${spark_table_name}_${CRAWL//-/}_${start_idx}_${end_idx}" "$seed_list"
-         "$SCRIPT_DIR/run_extract_wet_content.sh" "$CRAWL" "wet_${spark_table_name}_${CRAWL//-/}_${start_idx}_${end_idx}" "$seed_list"
+         echo "$SCRIPT_DIR/run_extract_wet_content.sh" "$CRAWL" "wet_${spark_table_name}_${CRAWL_LowerCase//-/}_${start_idx}_${end_idx}" "$seed_list" "$start_idx" "$end_idx"
+         "$SCRIPT_DIR/run_extract_wet_content.sh" "$CRAWL" "wet_${spark_table_name}_${CRAWL_LowerCase//-/}_${start_idx}_${end_idx}" "$seed_list" "$start_idx" "$end_idx"
          echo "wet_extract_content_table constructed for $CRAWL batch_${start_idx}_${end_idx}"
+      elif [ "$data_type" = "cc-index-table" ]; then # collect content page wet-files from the CC Index (process only ~300 files )
+         echo "#####################  run_filter_index @ $(date '+%Y-%m-%d %H:%M:%S') #####################"
+         if [ -z "$SCRATCH" ]; then
+            rm -rf "$PROJECT_ROOT/bash_scripts/spark-warehouse/warc_index_table_${spark_table_name}_${CRAWL_LowerCase//-/}_${start_idx}_${end_idx}/" # Remove any stale directory for the table name passed below (same name, so the cleanup actually matches)
+         else
+            rm -rf "$SCRATCH/spark-warehouse/warc_index_table_${spark_table_name}_${CRAWL_LowerCase//-/}_${start_idx}_${end_idx}/" # Same cleanup when the warehouse lives under $SCRATCH
+         fi
+         echo "$SCRIPT_DIR/run_filter_warc-index.sh" "$CRAWL" "warc_index_table_${spark_table_name}_${CRAWL_LowerCase//-/}_${start_idx}_${end_idx}" "$seed_list" "$start_idx" "$end_idx"
+         "$SCRIPT_DIR/run_filter_warc-index.sh" "$CRAWL" "warc_index_table_${spark_table_name}_${CRAWL_LowerCase//-/}_${start_idx}_${end_idx}" "$seed_list" "$start_idx" "$end_idx"
+         echo "filter_warc-index table constructed for $CRAWL batch_${start_idx}_${end_idx}"
       fi
   echo "********************** End Of $data_type Task @ $(date '+%Y-%m-%d %H:%M:%S') **********************"
   done
-done #< "$CRAWLS"
+done
diff --git a/bash_scripts/get_data.sh b/bash_scripts/get_data.sh
@@ -6,12 +6,14 @@ fetch_with_retries() {
   local tmp="${out}.part" i status sleep_time
   mkdir -p "$(dirname "$out")"
   for ((i=1; i<=tries; i++)); do
-    wget -c --tries=1 \
+    wget -q -c --tries=1 \
          --retry-connrefused \
          --retry-on-http-error=429,500,502,503,504 \
          --timeout=60 --read-timeout=60 \
          -O "$tmp" "$url" && status=0 || status=$?
-    if [[ $status -eq 0 ]] && gzip -t "$tmp" 2>/dev/null; then
+#    echo "status=$status tmp=$tmp"
+    if [[ $status -eq 0 ]] && ([[ $url = *.parquet ]] || ([[ $url = *.gz ]] && gzip -t "$tmp" 2>/dev/null)); then
+      echo "url=$url"
       mv -f "$tmp" "$out"
       return 0
     fi
@@ -41,7 +43,7 @@ else
 fi
 
 if [ -z "$3" ]; then
-      end_idx=30
+      end_idx=10
 else
       end_idx=$3
 fi
@@ -54,8 +56,16 @@ else
     IFS=',' read -ra cc_file_types <<< "$cleaned"  # 2. Convert comma-separated string to array0
 fi
 
+
+# Optional fifth argument: path of a text file whose lines are the files
+# to fetch. When the argument is missing or empty, listing is "0" and the
+# per-data_type code below falls back to the crawl's own
+# <data_type>.paths.gz listing.
+listing=${5:-0}
+
 echo "cc_file_types= ${cc_file_types[@]}"
 echo "start_idx=$start_idx end_idx=$end_idx"
+echo "listing path=$listing"
 
 # Get the root of the project (one level above this script's directory)
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
@@ -85,40 +95,78 @@ for data_type in  "${cc_file_types[@]}" ; do
   echo "Downloading Common Crawl paths listings (${data_type} files of $CRAWL)..."
 
   mkdir -p "$DATA_DIR/crawl-data/$CRAWL/"
-  listing="$DATA_DIR/crawl-data/$CRAWL/$data_type.paths.gz"
-  cd "$DATA_DIR/crawl-data/$CRAWL/"
-  wget --timestamping "$BASE_URL/crawl-data/$CRAWL/$data_type.paths.gz"
-  sleep 2
-  cd -
+  all_listing_content_path="$INPUT_DIR/${CRAWL}_test_all_${data_type}.txt"
+  echo "all_listing_content_path=$all_listing_content_path"
+  # Fill listing_content with the file paths for this crawl/data_type: from the cached text file, from the crawl's paths.gz, or from a caller-supplied listing.
+  if [[ "$listing" == "0" ]]; then
+      echo "listing is not provided"
+      listing="$DATA_DIR/crawl-data/$CRAWL/$data_type.paths.gz" # NOTE(review): overwrites the "0" sentinel, so a later data_type iteration would take the "listing is provided" branch — confirm
+      if [ -e "$all_listing_content_path" ]; then ## listing paths have been downloaded in previous batches
+          echo "$all_listing_content_path exist."
+          listing_content=$(<"$all_listing_content_path")  
+      else  ## listing paths are to be downloaded
+          cd "$DATA_DIR/crawl-data/$CRAWL/"
+          wget  -q --timestamping "$BASE_URL/crawl-data/$CRAWL/$data_type.paths.gz"
+          cd -
+          echo "Downloading ${data_type} file paths..."
+          file=$(gzip -dc "$listing" | head -1) # first path in the listing
+          full_path="$DATA_DIR/$file"
+          mkdir -p "$(dirname "$full_path")"
+          cd "$(dirname "$full_path")"
+          wget -q --timestamping "$BASE_URL/$file" # fetch that first file into the directory mirroring its listed path
+          cd -
+          input="$INPUT_DIR/all_${data_type}_${CRAWL}.txt"
+          echo "All ${data_type} files of ${CRAWL}: $input"
+          listing_content=$(gzip -dc "$listing")
+          if [ -e "$all_listing_content_path" ]; then
+              rm "$all_listing_content_path"
+          fi
+          echo "$listing_content" >>"$all_listing_content_path" # cache the decompressed listing so later batches can skip the download
+      fi
+  else   ############### Listing paths are given in a certain order (index)
+      echo "listing is provided"
+      listing_content=$(<"$listing")    
+      # echo "$listing_content"
+      if [ -e "$all_listing_content_path" ]; then
+              rm "$all_listing_content_path"
+      fi
+      echo "$listing_content" >>"$all_listing_content_path"
+  fi    
 
   echo "Downloading sample ${data_type} file..."
   # make sample fetch non-fatal so we reach the main loop even if it 503s
   file="$(gzip -dc "$listing" | head -1 || true)"
   if [ -n "$file" ]; then
     full_path="$DATA_DIR/$file"
     mkdir -p "$(dirname "$full_path")"
-    ( cd "$(dirname "$full_path")" && wget --timestamping "$BASE_URL/$file" ) || \
+    ( cd "$(dirname "$full_path")" && wget -q --timestamping "$BASE_URL/$file" ) || \
       echo "[WARN] sample $data_type fetch failed; continuing"
   fi
 
   input="$INPUT_DIR/all_${data_type}_${CRAWL}.txt"
   echo "All ${data_type} files of ${CRAWL}: $input"
-  listing_content=$(gzip -dc "$listing")
-  all_listing_content_path="$INPUT_DIR/test_all_${data_type}.txt"
-  echo "file:$listing_content" >>"$all_listing_content_path"
+  all_listing_content_path="$INPUT_DIR/${CRAWL}_test_all_${data_type}.txt"
+
+  if [[ "$data_type" = "cc-index-table" ]]; then
+    listing_content=$(gzip -dc "$listing" | grep -F "/subset=warc/")
+    # echo "cc-index-table listing_content=$listing_content"
+  elif [[ "$listing" = "0" ]]; then # NOTE(review): listing was already replaced above when it was "0", so this branch looks unreachable — confirm intent
+    listing_content=$(gzip -dc "$listing")
+  fi
 
+  # The listing cache ($all_listing_content_path) is written above and read back on later batches; the former append of "file:$listing_content" here made that cache grow on every run, so it is no longer done.
   listing_FilesCount=$(wc -l <<< "$listing_content")
   echo "listing_FilesCount=$listing_FilesCount"
   if [ "$listing_FilesCount" -lt "$end_idx" ] ; then
     end_idx=$listing_FilesCount
   fi
+  echo "end_idx=$end_idx"
   FilesCount=$((end_idx - start_idx + 1))
+  input="$INPUT_DIR/${CRAWL}_test_${data_type}_${start_idx}_${end_idx}.txt"
   start_idx=$((start_idx + 1))
   echo "To Process FilesCount=$FilesCount"
-
   wat_files=$(echo "$listing_content" | tail -n +$start_idx | head -n $FilesCount)
-  echo "Writing input file listings..."
-  input="$INPUT_DIR/test_${data_type}.txt"
+  echo "Writing input file listings..."  
   echo "Test file: $input"
   if [ -e "$input" ]; then
     rm "$input"
@@ -137,15 +185,29 @@ for data_type in  "${cc_file_types[@]}" ; do
   : > "$fail_log"
 
   while IFS= read -r wat_file; do
-    first=$(echo "$wat_file" | awk -F '/'$data_type'/' '{print $1}')
+    echo "wat_file=$wat_file"
+    if [[ "$data_type" = "cc-index-table" ]]; then
+      first=$(echo "$wat_file" | awk -F '/subset=warc/' '{print $1}')
+    else
+      first=$(echo "$wat_file" | awk -F '/'$data_type'/' '{print $1}')
+    fi
+    # echo "first=$first"
     file_path="$DATA_DIR/$wat_file"
-    target_dir="$DATA_DIR/$first/$data_type/"
+    # echo "file_path=$file_path"
+    if [[ "$data_type" = "cc-index-table" ]]; then
+      target_dir="$DATA_DIR/$first/subset=warc/"
+    else
+      target_dir="$DATA_DIR/$first/$data_type/"
+    fi
+    # echo "target_dir=$target_dir"
     target_file="${target_dir}$(basename "$wat_file")"
+    # echo "target_file=$target_file"
 
     if [ -f "$file_path" ] || [ -f "$target_file" ]; then
       # verify gzip; re-download if corrupt
       test -f "$file_path" && cand="$file_path" || cand="$target_file"
-      if gzip -t "$cand" 2>/dev/null; then
+      echo "cand=$cand"
+      if [[ $cand = *.parquet ]] || ([[ $cand = *.gz ]] && gzip -t "$cand" 2>/dev/null); then
         echo "File '$cand' exists."
         downloaded=$((downloaded+1))
         continue
@@ -168,7 +230,5 @@ for data_type in  "${cc_file_types[@]}" ; do
     fi
 
   done <<< "$wat_files"
-
   echo "[SUMMARY] $CRAWL type=$data_type downloaded=$downloaded skipped=$skipped fail_log=$fail_log"
-
-done
+done
diff --git a/bash_scripts/job_content_ext.sh b/bash_scripts/job_content_ext.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+#SBATCH --ntasks=1
+#SBATCH --partition=long-cpu
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=10
+#SBATCH --mem=64G
+#SBATCH --time=24:00:00
+#SBATCH --output=/home/mila/a/abdallah/scratch/jobs_log/cc-content-ext/cc-content-ext_job_%j.out
+#SBATCH --error=/home/mila/a/abdallah/scratch/jobs_log/cc-content-ext/cc-content-ext_job_%j.err
+
+# Slurm job: run end-to-end.sh over WET files in consecutive index batches.
+#
+# Usage: sbatch job_content_ext.sh [CRAWL] [Month] [sidx] [eidx] [batch_size]
+#   CRAWL       crawl id, lower-cased; names the listing file
+#               spark-warehouse/credigraph_<CRAWL>_wetFilesOrder.txt
+#               (default: ccmain202508)
+#   Month       selects CC-Crawls/<Month>.txt and the seed list under
+#               ../data/<Month>/ (default: Feb2025)
+#   sidx        first file index to process (default: 0)
+#   eidx        index to stop before; also part of the seed-list file
+#               name together with sidx (default: 10)
+#   batch_size  indices covered by each end-to-end.sh call (default: 10)
+#
+# All paths are relative to the working directory, so the job is presumably
+# meant to be submitted from bash_scripts/ — TODO confirm.
+
+# Exit on error
+set -e
+
+CRAWL=${1:-ccmain202508}
+CRAWL=${CRAWL,,}
+wetFilesOrder="credigraph_${CRAWL}_wetFilesOrder.txt"
+Month=${2:-Feb2025}
+sidx=${3:-0}
+eidx=${4:-10}
+batch_size=${5:-10}
+
+# Echo time and hostname into log
+echo "Date:     $(date)"
+echo "Hostname: $(hostname)"
+echo "CRAWL:     $CRAWL"
+echo "Month:     $Month"
+echo "sidx:     $sidx"
+echo "eidx:     $eidx"
+echo "batch_size:     $batch_size"
+export JAVA_HOME=~/jdk-17.0.12/
+export PATH=$PATH:$JAVA_HOME/bin
+
+# Use `uv run --offline` on clusters without internet access on compute nodes.
+# Run end-to-end.sh once per batch of batch_size indices in [sidx, eidx).
+for ((i = sidx; i < eidx; i += batch_size)); do
+    # Inclusive last index of this batch, clamped so a final partial batch
+    # does not run past eidx.
+    last=$((i + batch_size - 1))
+    if ((last >= eidx)); then last=$((eidx - 1)); fi
+    # "[wet]" is quoted so the shell cannot glob-expand it against a file
+    # named w, e or t in the working directory.
+    run=(./end-to-end.sh "CC-Crawls/${Month}.txt" "$i" "$last" "[wet]" "../data/${Month}/${Month}_domains_${sidx}_${eidx}.csv" content_ext_table "spark-warehouse/$wetFilesOrder")
+    echo "#########################################################################################"
+    echo "${run[@]}"
+    "${run[@]}"
+    # rm -r  ~/scratch/crawl-data/CC-MAIN-2025-08/segments
+done
diff --git a/bash_scripts/job_content_ext_sbatch10.sh b/bash_scripts/job_content_ext_sbatch10.sh
@@ -0,0 +1,21 @@
+# Submit one job_content_ext.sh Slurm job per index range of the selected crawl.
+# Pick the crawl by leaving exactly one CRAWL/Month pair uncommented.
+# CRAWL=ccmain202508
+# Month=Feb2025
+
+CRAWL=ccmain202505
+Month=Jan2025
+
+# CRAWL=ccmain202513
+# Month=Mar2025
+
+step=5000
+for ((i = 50000; i < 70000; i += step)); do
+    # Ranges starting at 20000 or later are submitted 10000 indices wide.
+    ((i >= 20000)) && step=10000
+    job=(./job_content_ext.sh "$CRAWL" "$Month" "$i" "$((i + step))" 50)
+    # Log the command, then submit; the trailing 50 is the per-job batch_size.
+    echo sbatch "${job[@]}"
+    sbatch "${job[@]}"
+done
+# sbatch ./job_content_ext.sh ccmain202508 Feb2025 0 10000 50
diff --git a/bash_scripts/job_index_build_sbatch.sh b/bash_scripts/job_index_build_sbatch.sh
@@ -0,0 +1,8 @@
+# Submit run_in_batches_warc-index.sh to Slurm, echoing the command first.
+# Alternative values: CRAWL=ccmain202508; Month=Feb2025 or Jan2025.
+CRAWL=None
+Month=Mar2025
+batch_size=10
+job=(./run_in_batches_warc-index.sh "$CRAWL" "$Month" 240 300 "$batch_size")
+echo sbatch "${job[@]}"
+sbatch "${job[@]}"