Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#Ours
data/
plots/
results/
saved_graphs/
spark-warehouse/
Expand Down Expand Up @@ -93,6 +94,10 @@ docs/_build/
# PyBuilder
.pybuilder/
target/
# parquet files
*.parquet
# log files
*.log*

# Jupyter Notebook
.ipynb_checkpoints
Expand Down Expand Up @@ -191,3 +196,8 @@ cython_debug/

# PyPI configuration file
.pypirc
bash_scripts/jdk-25_linux-x64_bin.tar
bash_scripts/jdk-17.0.12_linux-x64_bin.tar.gz
bash_scripts/jdk-25_linux-x64_bin.tar.gz
uv.lock
tgrag/distribution/kl_divergiance.py
1 change: 1 addition & 0 deletions bash_scripts/CC-Crawls/Feb2025.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
CC-MAIN-2025-08
1 change: 1 addition & 0 deletions bash_scripts/CC-Crawls/Jan2025.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
CC-MAIN-2025-05
1 change: 1 addition & 0 deletions bash_scripts/CC-Crawls/Mar2025.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
CC-MAIN-2025-13
2 changes: 1 addition & 1 deletion bash_scripts/end-to-end-url.sh
Original file line number Diff line number Diff line change
Expand Up @@ -88,4 +88,4 @@ for data_type in "${cc_file_types[@]}" ; do
fi
echo "********************** End Of $data_type Task **********************"
done
done < "$CRAWL_LIST_FILE"
done < "$CRAWL_LIST_FILE"
34 changes: 27 additions & 7 deletions bash_scripts/end-to-end.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,18 @@ else
spark_table_name=$6
fi


# Optional 7th argument (echoed below as "listing path"); 0 when it is
# missing or empty.
listing=${7:-0}

echo "cc_file_types= ${cc_file_types[@]}"
echo "start_idx=$start_idx end_idx=$end_idx"
echo "seed_list=$seed_list"
echo "spark_table_name=$spark_table_name"
echo "listing path=$listing"

CRAWL_ARG="$1"

Expand All @@ -68,10 +76,12 @@ for data_type in "${cc_file_types[@]}" ; do
echo "Processing $CRAWL..."
echo "Removing previous $CRAWL spark-warehouse"
rm -rf "$PROJECT_ROOT/bash_scripts/spark-warehouse/$CRAWL"
rm -rf "$PROJECT_ROOT/bash_scripts/spark-warehouse/warc_index_table_$CRAWL"
echo $CRAWL
CRAWL_LowerCase=${CRAWL,,}
echo "################################### run get data @ $(date '+%Y-%m-%d %H:%M:%S') ###################################"
echo "$SCRIPT_DIR/get_data.sh" "$CRAWL" $start_idx $end_idx "[$data_type]"
"$SCRIPT_DIR/get_data.sh" "$CRAWL" $start_idx $end_idx "[$data_type]"
echo "$SCRIPT_DIR/get_data.sh" "$CRAWL" $start_idx $end_idx "[$data_type]" $listing
"$SCRIPT_DIR/get_data.sh" "$CRAWL" $start_idx $end_idx "[$data_type]" $listing
echo "Data Downloaded for $CRAWL."
if [ "$data_type" = "wat" ]; then
echo "################ Start Processing Processing $data_type Files ######################"
Expand All @@ -84,14 +94,24 @@ for data_type in "${cc_file_types[@]}" ; do
elif [ "$data_type" = "wet" ]; then
echo "##################### run_wet_content_extraction @ $(date '+%Y-%m-%d %H:%M:%S') #####################"
if [ -z "$SCRATCH" ]; then
rm -rf "$PROJECT_ROOT/bash_scripts/spark-warehouse/wet_${spark_table_name}_${CRAWL//-/}_${start_idx}_${end_idx}/" # Remove re-created directories before running
rm -rf "$PROJECT_ROOT/bash_scripts/spark-warehouse/wet_${spark_table_name}_${CRAWL_LowerCase//-/}_${start_idx}_${end_idx}/" # Remove re-created directories before running
else
rm -rf "$SCRATCH/spark-warehouse/wet_${spark_table_name}_${CRAWL//-/}_${start_idx}_${end_idx}/" # Remove re-created directories before running
rm -rf "$SCRATCH/spark-warehouse/wet_${spark_table_name}_${CRAWL_LowerCase//-/}_${start_idx}_${end_idx}/" # Remove re-created directories before running
fi
echo "$SCRIPT_DIR/run_extract_wet_content.sh" "$CRAWL" "wet_${spark_table_name}_${CRAWL//-/}_${start_idx}_${end_idx}" "$seed_list"
"$SCRIPT_DIR/run_extract_wet_content.sh" "$CRAWL" "wet_${spark_table_name}_${CRAWL//-/}_${start_idx}_${end_idx}" "$seed_list"
echo "$SCRIPT_DIR/run_extract_wet_content.sh" "$CRAWL" "wet_${spark_table_name}_${CRAWL_LowerCase//-/}_${start_idx}_${end_idx}" "$seed_list" "$start_idx" "$end_idx"
"$SCRIPT_DIR/run_extract_wet_content.sh" "$CRAWL" "wet_${spark_table_name}_${CRAWL_LowerCase//-/}_${start_idx}_${end_idx}" "$seed_list" "$start_idx" "$end_idx"
echo "wet_extract_content_table constructed for $CRAWL batch_${start_idx}_${end_idx}"
elif [ "$data_type" = "cc-index-table" ]; then # collect content page wet-files from the CC Index (process only ~300 files )
echo "##################### run_filter_index @ $(date '+%Y-%m-%d %H:%M:%S') #####################"
if [ -z "$SCRATCH" ]; then
rm -rf "$PROJECT_ROOT/bash_scripts/spark-warehouse/cc-index-table_${spark_table_name}_${CRAWL_LowerCase//-/}_${start_idx}_${end_idx}/" # Remove re-created directories before running
else
rm -rf "$SCRATCH/spark-warehouse/cc-index-table_${spark_table_name}_${CRAWL_LowerCase//-/}_${start_idx}_${end_idx}/" # Remove re-created directories before running
fi
echo "$SCRIPT_DIR/run_filter_warc-index.sh" "$CRAWL" "warc_index_table_${spark_table_name}_${CRAWL_LowerCase//-/}_${start_idx}_${end_idx}" "$seed_list" "$start_idx" "$end_idx"
"$SCRIPT_DIR/run_filter_warc-index.sh" "$CRAWL" "warc_index_table_${spark_table_name}_${CRAWL_LowerCase//-/}_${start_idx}_${end_idx}" "$seed_list" "$start_idx" "$end_idx"
echo "filter_warc-index table constructed for $CRAWL batch_${start_idx}_${end_idx}"
fi
echo "********************** End Of $data_type Task @ $(date '+%Y-%m-%d %H:%M:%S') **********************"
done
done #< "$CRAWLS"
done
102 changes: 81 additions & 21 deletions bash_scripts/get_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,14 @@ fetch_with_retries() {
local tmp="${out}.part" i status sleep_time
mkdir -p "$(dirname "$out")"
for ((i=1; i<=tries; i++)); do
wget -c --tries=1 \
wget -q -c --tries=1 \
--retry-connrefused \
--retry-on-http-error=429,500,502,503,504 \
--timeout=60 --read-timeout=60 \
-O "$tmp" "$url" && status=0 || status=$?
if [[ $status -eq 0 ]] && gzip -t "$tmp" 2>/dev/null; then
# echo "status=$status tmp=$tmp"
if [[ $status -eq 0 ]] && ([[ $url = *.parquet ]] || ([[ $url = *.gz ]] && gzip -t "$tmp" 2>/dev/null)); then
echo "url=$url"
mv -f "$tmp" "$out"
return 0
fi
Expand Down Expand Up @@ -41,7 +43,7 @@ else
fi

if [ -z "$3" ]; then
end_idx=30
end_idx=10
else
end_idx=$3
fi
Expand All @@ -54,8 +56,16 @@ else
IFS=',' read -ra cc_file_types <<< "$cleaned" # 2. Convert comma-separated string to array0
fi


# Optional 5th argument (echoed below as "listing path"); "0" when it is
# missing or empty.
listing=${5:-0}

echo "cc_file_types= ${cc_file_types[@]}"
echo "start_idx=$start_idx end_idx=$end_idx"
echo "listing path=$listing"

# Get the root of the project (one level above this script's directory)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
Expand Down Expand Up @@ -85,40 +95,78 @@ for data_type in "${cc_file_types[@]}" ; do
echo "Downloading Common Crawl paths listings (${data_type} files of $CRAWL)..."

mkdir -p "$DATA_DIR/crawl-data/$CRAWL/"
listing="$DATA_DIR/crawl-data/$CRAWL/$data_type.paths.gz"
cd "$DATA_DIR/crawl-data/$CRAWL/"
wget --timestamping "$BASE_URL/crawl-data/$CRAWL/$data_type.paths.gz"
sleep 2
cd -
all_listing_content_path="$INPUT_DIR/${CRAWL}_test_all_${data_type}.txt"
echo "all_listing_content_path=$all_listing_content_path"

# Resolve $listing_content (the newline-separated list of file paths to fetch)
# and write it to $all_listing_content_path.
# "$listing" == "0" is the default used when no listing path was passed in.
if [[ "$listing" == "0" ]]; then
echo "listing is not provided"
# Fall back to the crawl's own <data_type>.paths.gz listing.
listing="$DATA_DIR/crawl-data/$CRAWL/$data_type.paths.gz"
if [ -e "$all_listing_content_path" ]; then ## listing paths were already saved by a previous batch
echo "$all_listing_content_path exist."
# Reuse the saved copy instead of downloading the listing again.
listing_content=$(<"$all_listing_content_path")
else ## listing paths still have to be downloaded
# wget writes into the current directory, hence the cd / cd - pair.
cd "$DATA_DIR/crawl-data/$CRAWL/"
wget -q --timestamping "$BASE_URL/crawl-data/$CRAWL/$data_type.paths.gz"
cd -
echo "Downloading ${data_type} file paths..."
# Fetch the first file named in the listing into its mirrored directory under $DATA_DIR.
file=$(gzip -dc "$listing" | head -1)
full_path="$DATA_DIR/$file"
mkdir -p "$(dirname "$full_path")"
cd "$(dirname "$full_path")"
wget -q --timestamping "$BASE_URL/$file"
cd -
input="$INPUT_DIR/all_${data_type}_${CRAWL}.txt"
echo "All ${data_type} files of ${CRAWL}: $input"
listing_content=$(gzip -dc "$listing")
# NOTE(review): this branch is only reached when the file did not exist at the
# check above, so this removal looks like a defensive no-op.
if [ -e "$all_listing_content_path" ]; then
rm "$all_listing_content_path"
fi
echo "$listing_content" >>"$all_listing_content_path"
fi
else ############### Listing paths are given in a certain order (index)
echo "listing is provided"
# The provided listing is read as plain text (no gzip decoding here).
listing_content=$(<"$listing")
# echo "$listing_content"
# Replace any previously saved copy with the provided listing.
if [ -e "$all_listing_content_path" ]; then
rm "$all_listing_content_path"
fi
echo "$listing_content" >>"$all_listing_content_path"
fi

echo "Downloading sample ${data_type} file..."
# make sample fetch non-fatal so we reach the main loop even if it 503s
file="$(gzip -dc "$listing" | head -1 || true)"
if [ -n "$file" ]; then
full_path="$DATA_DIR/$file"
mkdir -p "$(dirname "$full_path")"
( cd "$(dirname "$full_path")" && wget --timestamping "$BASE_URL/$file" ) || \
( cd "$(dirname "$full_path")" && wget -q --timestamping "$BASE_URL/$file" ) || \
echo "[WARN] sample $data_type fetch failed; continuing"
fi

input="$INPUT_DIR/all_${data_type}_${CRAWL}.txt"
echo "All ${data_type} files of ${CRAWL}: $input"
listing_content=$(gzip -dc "$listing")
all_listing_content_path="$INPUT_DIR/test_all_${data_type}.txt"
echo "file:$listing_content" >>"$all_listing_content_path"
all_listing_content_path="$INPUT_DIR/${CRAWL}_test_all_${data_type}.txt"

# Choose which listing entries are processed for this data type.
if [[ "$data_type" = "cc-index-table" ]]; then
  # cc-index-table: keep only the entries whose path contains /subset=warc/.
  listing_content=$(gzip -dc "$listing" | grep -F "/subset=warc/")
  # echo "cc-index-table listing_content=$listing_content"
elif [[ "$listing" = "0" ]]; then
  # Fixed: this was written `["$listing" = "0" ]`; without the space after `[`
  # the shell looked for a command named `[<value>` instead of running a test.
  # NOTE(review): when this branch is taken $listing is the literal "0", which
  # is then handed to gzip as a file name -- confirm the intended condition.
  listing_content=$(gzip -dc "$listing")
fi

echo "file:$listing_content" >>"$all_listing_content_path"
listing_FilesCount=$(wc -l <<< "$listing_content")
echo "listing_FilesCount=$listing_FilesCount"
if [ "$listing_FilesCount" -lt "$end_idx" ] ; then
end_idx=$listing_FilesCount
fi
echo "end_idx=$end_idx"
FilesCount=$((end_idx - start_idx + 1))
input="$INPUT_DIR/${CRAWL}_test_${data_type}_${start_idx}_${end_idx}.txt"
start_idx=$((start_idx + 1))
echo "To Process FilesCount=$FilesCount"

wat_files=$(echo "$listing_content" | tail -n +$start_idx | head -n $FilesCount)
echo "Writing input file listings..."
input="$INPUT_DIR/test_${data_type}.txt"
echo "Writing input file listings..."
echo "Test file: $input"
if [ -e "$input" ]; then
rm "$input"
Expand All @@ -137,15 +185,29 @@ for data_type in "${cc_file_types[@]}" ; do
: > "$fail_log"

while IFS= read -r wat_file; do
first=$(echo "$wat_file" | awk -F '/'$data_type'/' '{print $1}')
echo "wat_file=$wat_file"
if [[ "$data_type" = "cc-index-table" ]]; then
first=$(echo "$wat_file" | awk -F '/subset=warc/' '{print $1}')
else
first=$(echo "$wat_file" | awk -F '/'$data_type'/' '{print $1}')
fi
# echo "first=$first"
file_path="$DATA_DIR/$wat_file"
target_dir="$DATA_DIR/$first/$data_type/"
# echo "file_path=$file_path"
if [[ "$data_type" = "cc-index-table" ]]; then
target_dir="$DATA_DIR/$first/subset=warc/"
else
target_dir="$DATA_DIR/$first/$data_type/"
fi
# echo "target_dir=$target_dir"
target_file="${target_dir}$(basename "$wat_file")"
# echo "target_file=$target_file"

if [ -f "$file_path" ] || [ -f "$target_file" ]; then
# verify gzip; re-download if corrupt
test -f "$file_path" && cand="$file_path" || cand="$target_file"
if gzip -t "$cand" 2>/dev/null; then
echo "cand=$cand"
if [[ $cand = *.parquet ]] || ([[ $cand = *.gz ]] && gzip -t "$cand" 2>/dev/null); then
echo "File '$cand' exists."
downloaded=$((downloaded+1))
continue
Expand All @@ -168,7 +230,5 @@ for data_type in "${cc_file_types[@]}" ; do
fi

done <<< "$wat_files"

echo "[SUMMARY] $CRAWL type=$data_type downloaded=$downloaded skipped=$skipped fail_log=$fail_log"

done
done
63 changes: 63 additions & 0 deletions bash_scripts/job_content_ext.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#!/bin/bash
#SBATCH --ntasks=1
#SBATCH --partition=long-cpu
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=10
#SBATCH --mem=64G
#SBATCH --time=24:00:00
#SBATCH --output=/home/mila/a/abdallah/scratch/jobs_log/cc-content-ext/cc-content-ext_job_%j.out
#SBATCH --error=/home/mila/a/abdallah/scratch/jobs_log/cc-content-ext/cc-content-ext_job_%j.err

# Slurm job: run ./end-to-end.sh for the [wet] data type over the index range
# [sidx, eidx), one batch of batch_size indices per call.
#
# Usage: job_content_ext.sh [CRAWL] [Month] [sidx] [eidx] [batch_size]
#   CRAWL       crawl id, lower-cased before use          (default: ccmain202508)
#   Month       selects CC-Crawls/<Month>.txt and the
#               ../data/<Month>/ domains CSV              (default: Feb2025)
#   sidx        first index to process, inclusive         (default: 0)
#   eidx        end of the range, exclusive               (default: 10)
#   batch_size  number of indices per end-to-end.sh call  (default: 10)

# Exit on error
set -e

CRAWL=${1:-ccmain202508}
CRAWL=${CRAWL,,}
wetFilesOrder="credigraph_${CRAWL}_wetFilesOrder.txt"
Month=${2:-Feb2025}
sidx=${3:-0}
eidx=${4:-10}
batch_size=${5:-10}

# A zero (or non-numeric) step would make the batch loop below spin forever.
if ! ((batch_size > 0)); then
  echo "batch_size must be a positive integer, got '$batch_size'" >&2
  exit 1
fi

# Echo time and hostname into log
echo "Date: $(date)"
echo "Hostname: $(hostname)"
echo "CRAWL: $CRAWL"
echo "Month: $Month"
echo "sidx: $sidx"
echo "eidx: $eidx"
echo "batch_size: $batch_size"
export JAVA_HOME=~/jdk-17.0.12/
export PATH="$PATH:$JAVA_HOME/bin"

# Run end-to-end.sh once per batch. The batch end passed to it is inclusive.
for ((i = sidx; i < eidx; i += batch_size)); do
  last=$((i + batch_size - 1))
  # Clamp the final batch so it never reaches past the exclusive eidx when the
  # range is not a multiple of batch_size.
  if ((last >= eidx)); then
    last=$((eidx - 1))
  fi
  # "[wet]" is quoted: unquoted it is a glob pattern and would be replaced by a
  # file named w, e or t if one exists in the working directory.
  cmd=(
    ./end-to-end.sh
    "CC-Crawls/${Month}.txt"
    "$i"
    "$last"
    "[wet]"
    "../data/${Month}/${Month}_domains_${sidx}_${eidx}.csv"
    content_ext_table
    "spark-warehouse/$wetFilesOrder"
  )
  echo "#########################################################################################"
  echo "${cmd[@]}"
  "${cmd[@]}"
  # rm -r ~/scratch/crawl-data/CC-MAIN-2025-08/segments
done
21 changes: 21 additions & 0 deletions bash_scripts/job_content_ext_sbatch10.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash
# Submit job_content_ext.sh Slurm jobs that together cover the index range
# [50000, 70000) for one crawl. Each job receives: CRAWL Month start end 50,
# where 50 is the per-call batch size used inside the job.
#
# The shebang is required: the arithmetic `for (( ))` loop below is bash-only.

# CRAWL=ccmain202508
# Month=Feb2025

CRAWL=ccmain202505
Month=Jan2025

# CRAWL=ccmain202513
# Month=Mar2025

# Range handed to each job: 5000 indices below index 20000, 10000 from there
# on. With the current start of 50000 the larger size applies from the first
# iteration.
batch_size=5000
for ((i = 50000; i < 70000; i += batch_size)); do
  if ((i >= 20000)); then
    batch_size=10000
  fi
  end=$((i + batch_size))
  echo sbatch ./job_content_ext.sh "$CRAWL" "$Month" "$i" "$end" 50
  sbatch ./job_content_ext.sh "$CRAWL" "$Month" "$i" "$end" 50
done
# sbatch ./job_content_ext.sh ccmain202508 Feb2025 0 10000 50
8 changes: 8 additions & 0 deletions bash_scripts/job_index_build_sbatch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Submit run_in_batches_warc-index.sh to Slurm for one crawl month.
# Alternative settings kept for reference:
#   CRAWL=ccmain202508
#   Month=Feb2025
#   Month=Jan2025
CRAWL=None
Month=Mar2025
batch_size=10

# Print the sbatch command line, then run it with the same arguments.
announce_and_submit() {
  echo sbatch "$@"
  sbatch "$@"
}

announce_and_submit ./run_in_batches_warc-index.sh "$CRAWL" "$Month" 240 300 "$batch_size"
Loading