author     Santo Cariotti <santo@dcariotti.me>    2024-12-28 15:17:04 +0100
committer  Santo Cariotti <santo@dcariotti.me>    2024-12-28 15:17:04 +0100
commit     2aa010e47387f5c60d63824dce65f76f22eecddc (patch)
tree       cf39154c7e58c697da3d9c0e00fad711fe9e59c0
parent     246369828ecdaf879923b19ff222881cbe6c3953 (diff)
Add checks to scripts + update worker count
-rwxr-xr-x  scripts/02-dataproc-copy-jar.sh (renamed from scripts/04-dataproc-copy-jar.sh)    0
-rwxr-xr-x  scripts/02-dataproc-create-cluster.sh                                            11
-rwxr-xr-x  scripts/04-dataproc-create-cluster.sh                                            38
-rwxr-xr-x  scripts/05-dataproc-submit.sh                                                    41
-rwxr-xr-x  scripts/06-dataproc-update-cluster.sh                                            40
-rwxr-xr-x  scripts/07-cleanup.sh (renamed from scripts/06-cleanup.sh)                        0
6 files changed, 113 insertions, 17 deletions
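
All of these scripts read PROJECT, REGION, CLUSTER, SERVICE_ACCOUNT, and BUCKET_NAME from the environment rather than taking flags. A minimal setup sketch (every value below is a placeholder, not taken from the repository):

    export PROJECT=my-gcp-project            # hypothetical project id
    export REGION=europe-west1               # hypothetical region
    export CLUSTER=copurchase-cluster        # hypothetical cluster name
    export SERVICE_ACCOUNT=spark-runner      # short name; the scripts append @${PROJECT}.iam.gserviceaccount.com
    export BUCKET_NAME=my-copurchase-bucket  # hypothetical GCS bucket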
diff --git a/scripts/04-dataproc-copy-jar.sh b/scripts/02-dataproc-copy-jar.sh
index de8795f..de8795f 100755
--- a/scripts/04-dataproc-copy-jar.sh
+++ b/scripts/02-dataproc-copy-jar.sh
diff --git a/scripts/02-dataproc-create-cluster.sh b/scripts/02-dataproc-create-cluster.sh
deleted file mode 100755
index 1fee8b7..0000000
--- a/scripts/02-dataproc-create-cluster.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/sh
-
-gcloud dataproc clusters create ${CLUSTER} \
-    --project=${PROJECT} \
-    --region=${REGION} \
-    --service-account=${SERVICE_ACCOUNT}@${PROJECT}.iam.gserviceaccount.com \
-    --num-workers 2 \
-    --master-boot-disk-size 240 \
-    --worker-boot-disk-size 240 \
-    --worker-machine-type n1-standard-2 \
-    --master-machine-type n1-standard-2
diff --git a/scripts/04-dataproc-create-cluster.sh b/scripts/04-dataproc-create-cluster.sh
new file mode 100755
index 0000000..ada258d
--- /dev/null
+++ b/scripts/04-dataproc-create-cluster.sh
@@ -0,0 +1,38 @@
+#!/bin/sh
+
+set -eu
+
+if [ "$#" -ne 1 ]; then
+ echo "Usage: 'sh ${PWD}/$0 <num-workers>'"
+ exit 1
+fi
+
+
+NUM_WORKERS="$1"
+if [ "$NUM_WORKERS" -lt 1 ] || [ "$NUM_WORKERS" -gt 4 ]; then
+ echo "<num-workers> must be 1, 2, 3, or 4"
+ exit 1
+fi
+
+
+COMMON_PARAMS="\
+ --project=${PROJECT} \
+ --region=${REGION} \
+ --service-account=${SERVICE_ACCOUNT}@${PROJECT}.iam.gserviceaccount.com \
+ --master-boot-disk-size=240 \
+ --worker-boot-disk-size=240 \
+ --worker-machine-type=n1-standard-2 \
+ --master-machine-type=n1-standard-2"
+
+
+if [ "$NUM_WORKERS" -eq 1 ]; then
+ echo ">>>> Creating a single-node cluster..."
+ gcloud dataproc clusters create "${CLUSTER}" \
+ ${COMMON_PARAMS} \
+ --single-node
+else
+ echo ">>>> Creating a cluster with ${NUM_WORKERS} workers..."
+ gcloud dataproc clusters create "${CLUSTER}" \
+ ${COMMON_PARAMS} \
+ --num-workers="${NUM_WORKERS}"
+fi
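
The new script takes the worker count as its only argument: 1 creates a --single-node cluster, while 2 to 4 create a master plus that many n1-standard-2 workers. Example invocations (a sketch, assuming the environment variables above are exported and the repository root is the working directory):

    sh scripts/04-dataproc-create-cluster.sh 1   # single-node cluster
    sh scripts/04-dataproc-create-cluster.sh 3   # master + 3 workers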
diff --git a/scripts/05-dataproc-submit.sh b/scripts/05-dataproc-submit.sh
index dfc5498..b70e138 100755
--- a/scripts/05-dataproc-submit.sh
+++ b/scripts/05-dataproc-submit.sh
@@ -1,9 +1,38 @@
#!/bin/sh
-gcloud dataproc jobs submit spark \
-    --cluster=${CLUSTER} \
-    --jar=gs://${BUCKET_NAME}/scala/co-purchase-analysis_2.12-1.0.jar \
-    --region=${REGION} \
-    --properties spark.hadoop.fs.gs.impl=com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem \
-    -- gs://${BUCKET_NAME}/input/ gs://${BUCKET_NAME}/output/
+set -e
+
+if [ -z "${BUCKET_NAME}" ] || [ -z "${CLUSTER}" ] || [ -z "${REGION}" ]; then
+    echo "Error: BUCKET_NAME, CLUSTER, and REGION environment variables must be set."
+    exit 1
+fi
+
+INPUT_PATH="gs://${BUCKET_NAME}/input/"
+OUTPUT_PATH="gs://${BUCKET_NAME}/output"
+if gsutil ls "${OUTPUT_PATH}" > /dev/null 2>&1; then
+    echo ">>>> Output folder already exists. Renaming..."
+    UUID=$(cat /proc/sys/kernel/random/uuid)
+    NEW_OUTPUT_PATH="${OUTPUT_PATH}-${UUID}"
+
+    echo ">>>> Copying existing output folder to ${NEW_OUTPUT_PATH}..."
+    if gsutil -m cp -r "${OUTPUT_PATH}/" "${NEW_OUTPUT_PATH}/"; then
+        echo ">>>> Deleting original output folder..."
+        if gsutil -m rm -r "${OUTPUT_PATH}"; then
+            echo ">>>> Original output folder successfully renamed to ${NEW_OUTPUT_PATH}"
+        else
+            echo "Error: Failed to delete the original output folder after copying."
+            exit 1
+        fi
+    else
+        echo "Error: Failed to copy the output folder to ${NEW_OUTPUT_PATH}."
+        exit 1
+    fi
+fi
+
+gcloud dataproc jobs submit spark \
+    --cluster="${CLUSTER}" \
+    --jar="gs://${BUCKET_NAME}/scala/co-purchase-analysis_2.12-1.0.jar" \
+    --region="${REGION}" \
+    --properties="spark.hadoop.fs.gs.impl=com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem" \
+    -- "${INPUT_PATH}" "${OUTPUT_PATH}"
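
The copy-then-delete backup above is the same work gsutil mv performs internally, so it could be collapsed into a single rename; a sketch (uuidgen is an assumed substitute for /proc/sys/kernel/random/uuid, which only exists on Linux):

    UUID=$(uuidgen)   # assumes uuidgen is installed; the script reads /proc instead
    gsutil -m mv "gs://${BUCKET_NAME}/output" "gs://${BUCKET_NAME}/output-${UUID}"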
diff --git a/scripts/06-dataproc-update-cluster.sh b/scripts/06-dataproc-update-cluster.sh
new file mode 100755
index 0000000..cb098ef
--- /dev/null
+++ b/scripts/06-dataproc-update-cluster.sh
@@ -0,0 +1,40 @@
+#!/bin/sh
+
+set -eu
+
+if [ "$#" -ne 1 ]; then
+ echo "Usage: 'sh ${PWD}/$0 <num-workers>'"
+ exit 1
+fi
+
+NUM_WORKERS="$1"
+
+if [ "$NUM_WORKERS" -lt 1 ] || [ "$NUM_WORKERS" -gt 4 ]; then
+ echo "<num-workers> must be 1, 2, 3, or 4"
+ exit 1
+fi
+
+
+# Handle single worker case
+if [ "$NUM_WORKERS" -eq 1 ]; then
+ if gcloud dataproc clusters describe "${CLUSTER}" --region="${REGION}" > /dev/null 2>&1; then
+ echo ">>>> Cluster exists. Destroying it..."
+ gcloud dataproc clusters delete "${CLUSTER}" --region="${REGION}" --quiet
+ fi
+
+ echo ">>>> Creating a new cluster with 1 worker..."
+ eval "scripts/04-dataproc-create-cluster.sh 1"
+else
+ if ! gcloud dataproc clusters update "${CLUSTER}" \
+ --project="${PROJECT}" --region="${REGION}" \
+ --num-workers="${NUM_WORKERS}" > /dev/null 2>&1; then
+ echo ">>>> Cluster is a single node. Destroying it to update the number of workers..."
+ gcloud dataproc clusters delete "${CLUSTER}" --region="${REGION}" --quiet
+
+ echo ">>>> Creating a new cluster with ${NUM_WORKERS} workers..."
+ eval "scripts/04-dataproc-create-cluster.sh ${NUM_WORKERS}"
+ else
+ echo ">>>> Successfully updated the cluster to ${NUM_WORKERS} workers."
+ fi
+fi
+
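
A single-node Dataproc cluster cannot be scaled in place, which is why the script deletes and recreates the cluster whenever the update fails; note that > /dev/null 2>&1 also hides unrelated update failures (quota, permissions), which then trigger the same delete-and-recreate path. Typical invocations (a sketch, same assumed environment as above):

    sh scripts/06-dataproc-update-cluster.sh 4   # scale to 4 workers in place
    sh scripts/06-dataproc-update-cluster.sh 1   # delete and recreate as single-node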
diff --git a/scripts/06-cleanup.sh b/scripts/07-cleanup.sh
index 50c10f7..50c10f7 100755
--- a/scripts/06-cleanup.sh
+++ b/scripts/07-cleanup.sh