author     Santo Cariotti <santo@dcariotti.me>  2024-12-28 15:17:04 +0100
committer  Santo Cariotti <santo@dcariotti.me>  2024-12-28 15:17:04 +0100
commit     2aa010e47387f5c60d63824dce65f76f22eecddc
tree       cf39154c7e58c697da3d9c0e00fad711fe9e59c0
parent     246369828ecdaf879923b19ff222881cbe6c3953
Check on scripts + update num worker
-rwxr-xr-x | scripts/02-dataproc-copy-jar.sh (renamed from scripts/04-dataproc-copy-jar.sh) |  0
-rwxr-xr-x | scripts/02-dataproc-create-cluster.sh                                          | 11
-rwxr-xr-x | scripts/04-dataproc-create-cluster.sh                                          | 38
-rwxr-xr-x | scripts/05-dataproc-submit.sh                                                  | 41
-rwxr-xr-x | scripts/06-dataproc-update-cluster.sh                                          | 40
-rwxr-xr-x | scripts/07-cleanup.sh (renamed from scripts/06-cleanup.sh)                     |  0

6 files changed, 113 insertions, 17 deletions
diff --git a/scripts/04-dataproc-copy-jar.sh b/scripts/02-dataproc-copy-jar.sh
index de8795f..de8795f 100755
--- a/scripts/04-dataproc-copy-jar.sh
+++ b/scripts/02-dataproc-copy-jar.sh
diff --git a/scripts/02-dataproc-create-cluster.sh b/scripts/02-dataproc-create-cluster.sh
deleted file mode 100755
index 1fee8b7..0000000
--- a/scripts/02-dataproc-create-cluster.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/sh
-
-gcloud dataproc clusters create ${CLUSTER} \
-    --project=${PROJECT} \
-    --region=${REGION} \
-    --service-account=${SERVICE_ACCOUNT}@${PROJECT}.iam.gserviceaccount.com \
-    --num-workers 2 \
-    --master-boot-disk-size 240 \
-    --worker-boot-disk-size 240 \
-    --worker-machine-type n1-standard-2 \
-    --master-machine-type n1-standard-2
diff --git a/scripts/04-dataproc-create-cluster.sh b/scripts/04-dataproc-create-cluster.sh
new file mode 100755
index 0000000..ada258d
--- /dev/null
+++ b/scripts/04-dataproc-create-cluster.sh
@@ -0,0 +1,38 @@
+#!/bin/sh
+
+set -eu
+
+if [ "$#" -ne 1 ]; then
+    echo "Usage: 'sh ${PWD}/$0 <num-workers>'"
+    exit 1
+fi
+
+
+NUM_WORKERS="$1"
+if [ "$NUM_WORKERS" -lt 1 ] || [ "$NUM_WORKERS" -gt 4 ]; then
+    echo "<num-workers> must be 1, 2, 3, or 4"
+    exit 1
+fi
+
+
+COMMON_PARAMS="\
+    --project=${PROJECT} \
+    --region=${REGION} \
+    --service-account=${SERVICE_ACCOUNT}@${PROJECT}.iam.gserviceaccount.com \
+    --master-boot-disk-size=240 \
+    --worker-boot-disk-size=240 \
+    --worker-machine-type=n1-standard-2 \
+    --master-machine-type=n1-standard-2"
+
+
+if [ "$NUM_WORKERS" -eq 1 ]; then
+    echo ">>>> Creating a single-node cluster..."
+    gcloud dataproc clusters create "${CLUSTER}" \
+        ${COMMON_PARAMS} \
+        --single-node
+else
+    echo ">>>> Creating a cluster with ${NUM_WORKERS} workers..."
+    gcloud dataproc clusters create "${CLUSTER}" \
+        ${COMMON_PARAMS} \
+        --num-workers="${NUM_WORKERS}"
+fi
diff --git a/scripts/05-dataproc-submit.sh b/scripts/05-dataproc-submit.sh
index dfc5498..b70e138 100755
--- a/scripts/05-dataproc-submit.sh
+++ b/scripts/05-dataproc-submit.sh
@@ -1,9 +1,38 @@
 #!/bin/sh
 
-gcloud dataproc jobs submit spark \
-    --cluster=${CLUSTER} \
-    --jar=gs://${BUCKET_NAME}/scala/co-purchase-analysis_2.12-1.0.jar \
-    --region=${REGION} \
-    --properties spark.hadoop.fs.gs.impl=com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem \
-    -- gs://${BUCKET_NAME}/input/ gs://${BUCKET_NAME}/output/
+set -e
+
+INPUT_PATH="gs://${BUCKET_NAME}/input/"
+OUTPUT_PATH="gs://${BUCKET_NAME}/output"
+
+if [ -z "${BUCKET_NAME}" ] || [ -z "${CLUSTER}" ] || [ -z "${REGION}" ]; then
+    echo "Error: BUCKET_NAME, CLUSTER, and REGION environment variables must be set."
+    exit 1
+fi
+if gsutil ls "${OUTPUT_PATH}" > /dev/null 2>&1; then
+    echo ">>>> Output folder already exists. Renaming..."
+    UUID=$(cat /proc/sys/kernel/random/uuid)
+    NEW_OUTPUT_PATH="${OUTPUT_PATH}-${UUID}"
+
+    echo ">>>> Copying existing output folder to ${NEW_OUTPUT_PATH}..."
+    if gsutil -m cp -r "${OUTPUT_PATH}/" "${NEW_OUTPUT_PATH}/"; then
+        echo ">>>> Deleting original output folder..."
+        if gsutil -m rm -r "${OUTPUT_PATH}"; then
+            echo ">>>> Original output folder successfully renamed to ${NEW_OUTPUT_PATH}"
+        else
+            echo "Error: Failed to delete the original output folder after copying."
+            exit 1
+        fi
+    else
+        echo "Error: Failed to copy the output folder to ${NEW_OUTPUT_PATH}."
+        exit 1
+    fi
+fi
+
+gcloud dataproc jobs submit spark \
+    --cluster="${CLUSTER}" \
+    --jar="gs://${BUCKET_NAME}/scala/co-purchase-analysis_2.12-1.0.jar" \
+    --region="${REGION}" \
+    --properties="spark.hadoop.fs.gs.impl=com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem" \
+    -- "${INPUT_PATH}" "${OUTPUT_PATH}"
 
diff --git a/scripts/06-dataproc-update-cluster.sh b/scripts/06-dataproc-update-cluster.sh
new file mode 100755
index 0000000..cb098ef
--- /dev/null
+++ b/scripts/06-dataproc-update-cluster.sh
@@ -0,0 +1,40 @@
+#!/bin/sh
+
+set -eu
+
+if [ "$#" -ne 1 ]; then
+    echo "Usage: 'sh ${PWD}/$0 <num-workers>'"
+    exit 1
+fi
+
+NUM_WORKERS="$1"
+
+if [ "$NUM_WORKERS" -lt 1 ] || [ "$NUM_WORKERS" -gt 4 ]; then
+    echo "<num-workers> must be 1, 2, 3, or 4"
+    exit 1
+fi
+
+
+# Handle single worker case
+if [ "$NUM_WORKERS" -eq 1 ]; then
+    if gcloud dataproc clusters describe "${CLUSTER}" --region="${REGION}" > /dev/null 2>&1; then
+        echo ">>>> Cluster exists. Destroying it..."
+        gcloud dataproc clusters delete "${CLUSTER}" --region="${REGION}" --quiet
+    fi
+
+    echo ">>>> Creating a new cluster with 1 worker..."
+    eval "scripts/04-dataproc-create-cluster.sh 1"
+else
+    if ! gcloud dataproc clusters update "${CLUSTER}" \
+        --project="${PROJECT}" --region="${REGION}" \
+        --num-workers="${NUM_WORKERS}" > /dev/null 2>&1; then
+        echo ">>>> Cluster is a single node. Destroying it to update the number of workers..."
+        gcloud dataproc clusters delete "${CLUSTER}" --region="${REGION}" --quiet
+
+        echo ">>>> Creating a new cluster with ${NUM_WORKERS} workers..."
+        eval "scripts/04-dataproc-create-cluster.sh ${NUM_WORKERS}"
+    else
+        echo ">>>> Successfully updated the cluster to ${NUM_WORKERS} workers."
+    fi
+fi
+
diff --git a/scripts/06-cleanup.sh b/scripts/07-cleanup.sh
index 50c10f7..50c10f7 100755
--- a/scripts/06-cleanup.sh
+++ b/scripts/07-cleanup.sh
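All of these scripts read their configuration from environment variables (PROJECT, REGION, CLUSTER, SERVICE_ACCOUNT, BUCKET_NAME) rather than flags. As orientation, here is a minimal driver sketch using the script numbering introduced by this commit; every exported value below is a placeholder assumed for illustration, not part of the patch:

    #!/bin/sh
    # Hypothetical configuration -- substitute your own values.
    export PROJECT="my-project"
    export REGION="europe-west1"
    export CLUSTER="copurchase-cluster"
    export SERVICE_ACCOUNT="dataproc-sa"
    export BUCKET_NAME="my-bucket"

    sh scripts/02-dataproc-copy-jar.sh            # presumably uploads the job jar to gs://${BUCKET_NAME}/scala/ (contents unchanged by this commit)
    sh scripts/04-dataproc-create-cluster.sh 2    # 1 => --single-node, 2-4 => --num-workers
    sh scripts/05-dataproc-submit.sh              # archives any existing output folder, then submits the Spark job
    sh scripts/06-dataproc-update-cluster.sh 4    # falls back to delete-and-recreate when the cluster was single-node
    sh scripts/07-cleanup.sh                      # tear everything down

Note the design choice in 05-dataproc-submit.sh: since GCS has no server-side rename, the script "renames" an existing output folder by copying it to a UUID-suffixed path and deleting the original, so a resubmitted job never clobbers earlier results.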