author     Santo Cariotti <santo@dcariotti.me>    2024-12-28 15:17:04 +0100
committer  Santo Cariotti <santo@dcariotti.me>    2024-12-28 15:17:04 +0100
commit     2aa010e47387f5c60d63824dce65f76f22eecddc (patch)
tree       cf39154c7e58c697da3d9c0e00fad711fe9e59c0
parent     246369828ecdaf879923b19ff222881cbe6c3953 (diff)
Add checks to scripts + update worker count
-rwxr-xr-x  scripts/02-dataproc-copy-jar.sh (renamed from scripts/04-dataproc-copy-jar.sh)    0
-rwxr-xr-x  scripts/02-dataproc-create-cluster.sh                                            11
-rwxr-xr-x  scripts/04-dataproc-create-cluster.sh                                            38
-rwxr-xr-x  scripts/05-dataproc-submit.sh                                                    41
-rwxr-xr-x  scripts/06-dataproc-update-cluster.sh                                            40
-rwxr-xr-x  scripts/07-cleanup.sh (renamed from scripts/06-cleanup.sh)                        0
6 files changed, 113 insertions, 17 deletions
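
All of these scripts read PROJECT, REGION, CLUSTER, SERVICE_ACCOUNT, and BUCKET_NAME from the environment rather than taking flags. A minimal setup sketch (every value below is a placeholder, not taken from the repository):

    export PROJECT=my-gcp-project            # hypothetical project id
    export REGION=europe-west1               # hypothetical region
    export CLUSTER=copurchase-cluster        # hypothetical cluster name
    export SERVICE_ACCOUNT=spark-runner      # short name; the scripts append @${PROJECT}.iam.gserviceaccount.com
    export BUCKET_NAME=my-copurchase-bucket  # hypothetical GCS bucket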
diff --git a/scripts/04-dataproc-copy-jar.sh b/scripts/02-dataproc-copy-jar.sh
index de8795f..de8795f 100755
--- a/scripts/04-dataproc-copy-jar.sh
+++ b/scripts/02-dataproc-copy-jar.sh
diff --git a/scripts/02-dataproc-create-cluster.sh b/scripts/02-dataproc-create-cluster.sh
deleted file mode 100755
index 1fee8b7..0000000
--- a/scripts/02-dataproc-create-cluster.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/sh
-
-gcloud dataproc clusters create ${CLUSTER} \
-    --project=${PROJECT} \
-    --region=${REGION} \
-    --service-account=${SERVICE_ACCOUNT}@${PROJECT}.iam.gserviceaccount.com \
-    --num-workers 2 \
-    --master-boot-disk-size 240 \
-    --worker-boot-disk-size 240 \
-    --worker-machine-type n1-standard-2 \
-    --master-machine-type n1-standard-2
diff --git a/scripts/04-dataproc-create-cluster.sh b/scripts/04-dataproc-create-cluster.sh
new file mode 100755
index 0000000..ada258d
--- /dev/null
+++ b/scripts/04-dataproc-create-cluster.sh
@@ -0,0 +1,38 @@
+#!/bin/sh
+
+set -eu
+
+if [ "$#" -ne 1 ]; then
+ echo "Usage: 'sh ${PWD}/$0 <num-workers>'"
+ exit 1
+fi
+
+
+NUM_WORKERS="$1"
+if [ "$NUM_WORKERS" -lt 1 ] || [ "$NUM_WORKERS" -gt 4 ]; then
+ echo "<num-workers> must be 1, 2, 3, or 4"
+ exit 1
+fi
+
+
+COMMON_PARAMS="\
+ --project=${PROJECT} \
+ --region=${REGION} \
+ --service-account=${SERVICE_ACCOUNT}@${PROJECT}.iam.gserviceaccount.com \
+ --master-boot-disk-size=240 \
+ --worker-boot-disk-size=240 \
+ --worker-machine-type=n1-standard-2 \
+ --master-machine-type=n1-standard-2"
+
+
+if [ "$NUM_WORKERS" -eq 1 ]; then
+ echo ">>>> Creating a single-node cluster..."
+ gcloud dataproc clusters create "${CLUSTER}" \
+ ${COMMON_PARAMS} \
+ --single-node
+else
+ echo ">>>> Creating a cluster with ${NUM_WORKERS} workers..."
+ gcloud dataproc clusters create "${CLUSTER}" \
+ ${COMMON_PARAMS} \
+ --num-workers="${NUM_WORKERS}"
+fi
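
The new script takes the worker count as its only argument: 1 creates a --single-node cluster, while 2 to 4 create a master plus that many n1-standard-2 workers. Example invocations (a sketch, assuming the environment variables above are exported and the repository root is the working directory):

    sh scripts/04-dataproc-create-cluster.sh 1   # single-node cluster
    sh scripts/04-dataproc-create-cluster.sh 3   # master + 3 workers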
diff --git a/scripts/05-dataproc-submit.sh b/scripts/05-dataproc-submit.sh
index dfc5498..b70e138 100755
--- a/scripts/05-dataproc-submit.sh
+++ b/scripts/05-dataproc-submit.sh
@@ -1,9 +1,38 @@
#!/bin/sh
-gcloud dataproc jobs submit spark \
-    --cluster=${CLUSTER} \
-    --jar=gs://${BUCKET_NAME}/scala/co-purchase-analysis_2.12-1.0.jar \
-    --region=${REGION} \
-    --properties spark.hadoop.fs.gs.impl=com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem \
-    -- gs://${BUCKET_NAME}/input/ gs://${BUCKET_NAME}/output/
+set -e
+
+if [ -z "${BUCKET_NAME}" ] || [ -z "${CLUSTER}" ] || [ -z "${REGION}" ]; then
+    echo "Error: BUCKET_NAME, CLUSTER, and REGION environment variables must be set."
+    exit 1
+fi
+
+INPUT_PATH="gs://${BUCKET_NAME}/input/"
+OUTPUT_PATH="gs://${BUCKET_NAME}/output"
+if gsutil ls "${OUTPUT_PATH}" > /dev/null 2>&1; then
+    echo ">>>> Output folder already exists. Renaming..."
+    UUID=$(cat /proc/sys/kernel/random/uuid)
+    NEW_OUTPUT_PATH="${OUTPUT_PATH}-${UUID}"
+
+    echo ">>>> Copying existing output folder to ${NEW_OUTPUT_PATH}..."
+    if gsutil -m cp -r "${OUTPUT_PATH}/" "${NEW_OUTPUT_PATH}/"; then
+        echo ">>>> Deleting original output folder..."
+        if gsutil -m rm -r "${OUTPUT_PATH}"; then
+            echo ">>>> Original output folder successfully renamed to ${NEW_OUTPUT_PATH}"
+        else
+            echo "Error: Failed to delete the original output folder after copying."
+            exit 1
+        fi
+    else
+        echo "Error: Failed to copy the output folder to ${NEW_OUTPUT_PATH}."
+        exit 1
+    fi
+fi
+
+gcloud dataproc jobs submit spark \
+    --cluster="${CLUSTER}" \
+    --jar="gs://${BUCKET_NAME}/scala/co-purchase-analysis_2.12-1.0.jar" \
+    --region="${REGION}" \
+    --properties="spark.hadoop.fs.gs.impl=com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem" \
+    -- "${INPUT_PATH}" "${OUTPUT_PATH}"
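
The copy-then-delete backup above is the same work gsutil mv performs internally, so it could be collapsed into a single rename; a sketch (uuidgen is an assumed substitute for /proc/sys/kernel/random/uuid, which only exists on Linux):

    UUID=$(uuidgen)   # assumes uuidgen is installed; the script reads /proc instead
    gsutil -m mv "gs://${BUCKET_NAME}/output" "gs://${BUCKET_NAME}/output-${UUID}"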
diff --git a/scripts/06-dataproc-update-cluster.sh b/scripts/06-dataproc-update-cluster.sh
new file mode 100755
index 0000000..cb098ef
--- /dev/null
+++ b/scripts/06-dataproc-update-cluster.sh
@@ -0,0 +1,40 @@
+#!/bin/sh
+
+set -eu
+
+if [ "$#" -ne 1 ]; then
+ echo "Usage: 'sh ${PWD}/$0 <num-workers>'"
+ exit 1
+fi
+
+NUM_WORKERS="$1"
+
+if [ "$NUM_WORKERS" -lt 1 ] || [ "$NUM_WORKERS" -gt 4 ]; then
+ echo "<num-workers> must be 1, 2, 3, or 4"
+ exit 1
+fi
+
+
+# Handle single worker case
+if [ "$NUM_WORKERS" -eq 1 ]; then
+ if gcloud dataproc clusters describe "${CLUSTER}" --region="${REGION}" > /dev/null 2>&1; then
+ echo ">>>> Cluster exists. Destroying it..."
+ gcloud dataproc clusters delete "${CLUSTER}" --region="${REGION}" --quiet
+ fi
+
+ echo ">>>> Creating a new cluster with 1 worker..."
+ eval "scripts/04-dataproc-create-cluster.sh 1"
+else
+ if ! gcloud dataproc clusters update "${CLUSTER}" \
+ --project="${PROJECT}" --region="${REGION}" \
+ --num-workers="${NUM_WORKERS}" > /dev/null 2>&1; then
+ echo ">>>> Cluster is a single node. Destroying it to update the number of workers..."
+ gcloud dataproc clusters delete "${CLUSTER}" --region="${REGION}" --quiet
+
+ echo ">>>> Creating a new cluster with ${NUM_WORKERS} workers..."
+ eval "scripts/04-dataproc-create-cluster.sh ${NUM_WORKERS}"
+ else
+ echo ">>>> Successfully updated the cluster to ${NUM_WORKERS} workers."
+ fi
+fi
+
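
A single-node Dataproc cluster cannot be scaled in place, which is why the script deletes and recreates the cluster whenever the update fails; note that > /dev/null 2>&1 also hides unrelated update failures (quota, permissions), which then trigger the same delete-and-recreate path. Typical invocations (a sketch, same assumed environment as above):

    sh scripts/06-dataproc-update-cluster.sh 4   # scale to 4 workers in place
    sh scripts/06-dataproc-update-cluster.sh 1   # delete and recreate as single-node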
diff --git a/scripts/06-cleanup.sh b/scripts/07-cleanup.sh
index 50c10f7..50c10f7 100755
--- a/scripts/06-cleanup.sh
+++ b/scripts/07-cleanup.sh