From 2aa010e47387f5c60d63824dce65f76f22eecddc Mon Sep 17 00:00:00 2001
From: Santo Cariotti
Date: Sat, 28 Dec 2024 15:17:04 +0100
Subject: Check on scripts + update num worker

---
 scripts/02-dataproc-copy-jar.sh       |  8 +++++++
 scripts/02-dataproc-create-cluster.sh | 11 ----------
 scripts/04-dataproc-copy-jar.sh       |  8 -------
 scripts/04-dataproc-create-cluster.sh | 38 ++++++++++++++++++++++++++++++++
 scripts/05-dataproc-submit.sh         | 41 ++++++++++++++++++++++++++++++-----
 scripts/06-cleanup.sh                 |  5 -----
 scripts/06-dataproc-update-cluster.sh | 40 ++++++++++++++++++++++++++++++++++
 scripts/07-cleanup.sh                 |  5 +++++
 8 files changed, 126 insertions(+), 30 deletions(-)
 create mode 100755 scripts/02-dataproc-copy-jar.sh
 delete mode 100755 scripts/02-dataproc-create-cluster.sh
 delete mode 100755 scripts/04-dataproc-copy-jar.sh
 create mode 100755 scripts/04-dataproc-create-cluster.sh
 delete mode 100755 scripts/06-cleanup.sh
 create mode 100755 scripts/06-dataproc-update-cluster.sh
 create mode 100755 scripts/07-cleanup.sh

diff --git a/scripts/02-dataproc-copy-jar.sh b/scripts/02-dataproc-copy-jar.sh
new file mode 100755
index 0000000..de8795f
--- /dev/null
+++ b/scripts/02-dataproc-copy-jar.sh
@@ -0,0 +1,8 @@
+#!/bin/sh
+
+cd ./co-purchase-analysis
+SCALA_VERSION=2.12.10 sbt clean package
+cd -
+
+gcloud storage cp co-purchase-analysis/target/scala-2.12/co-purchase-analysis_2.12-1.0.jar \
+    gs://${BUCKET_NAME}/scala/co-purchase-analysis_2.12-1.0.jar
diff --git a/scripts/02-dataproc-create-cluster.sh b/scripts/02-dataproc-create-cluster.sh
deleted file mode 100755
index 1fee8b7..0000000
--- a/scripts/02-dataproc-create-cluster.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/sh
-
-gcloud dataproc clusters create ${CLUSTER} \
-    --project=${PROJECT} \
-    --region=${REGION} \
-    --service-account=${SERVICE_ACCOUNT}@${PROJECT}.iam.gserviceaccount.com \
-    --num-workers 2 \
-    --master-boot-disk-size 240 \
-    --worker-boot-disk-size 240 \
-    --worker-machine-type n1-standard-2 \
-    --master-machine-type n1-standard-2
diff --git a/scripts/04-dataproc-copy-jar.sh b/scripts/04-dataproc-copy-jar.sh
deleted file mode 100755
index de8795f..0000000
--- a/scripts/04-dataproc-copy-jar.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/sh
-
-cd ./co-purchase-analysis
-SCALA_VERSION=2.12.10 sbt clean package
-cd -
-
-gcloud storage cp co-purchase-analysis/target/scala-2.12/co-purchase-analysis_2.12-1.0.jar \
-    gs://${BUCKET_NAME}/scala/co-purchase-analysis_2.12-1.0.jar
diff --git a/scripts/04-dataproc-create-cluster.sh b/scripts/04-dataproc-create-cluster.sh
new file mode 100755
index 0000000..ada258d
--- /dev/null
+++ b/scripts/04-dataproc-create-cluster.sh
@@ -0,0 +1,38 @@
+#!/bin/sh
+
+set -eu
+
+if [ "$#" -ne 1 ]; then
+    echo "Usage: 'sh ${PWD}/$0 <num-workers>'"
+    exit 1
+fi
+
+
+NUM_WORKERS="$1"
+if [ "$NUM_WORKERS" -lt 1 ] || [ "$NUM_WORKERS" -gt 4 ]; then
+    echo "<num-workers> must be 1, 2, 3, or 4"
+    exit 1
+fi
+
+
+COMMON_PARAMS="\
+    --project=${PROJECT} \
+    --region=${REGION} \
+    --service-account=${SERVICE_ACCOUNT}@${PROJECT}.iam.gserviceaccount.com \
+    --master-boot-disk-size=240 \
+    --worker-boot-disk-size=240 \
+    --worker-machine-type=n1-standard-2 \
+    --master-machine-type=n1-standard-2"
+
+
+if [ "$NUM_WORKERS" -eq 1 ]; then
+    echo ">>>> Creating a single-node cluster..."
+    gcloud dataproc clusters create "${CLUSTER}" \
+        ${COMMON_PARAMS} \
+        --single-node
+else
+    echo ">>>> Creating a cluster with ${NUM_WORKERS} workers..."
+    gcloud dataproc clusters create "${CLUSTER}" \
+        ${COMMON_PARAMS} \
+        --num-workers="${NUM_WORKERS}"
+fi
diff --git a/scripts/05-dataproc-submit.sh b/scripts/05-dataproc-submit.sh
index dfc5498..b70e138 100755
--- a/scripts/05-dataproc-submit.sh
+++ b/scripts/05-dataproc-submit.sh
@@ -1,9 +1,38 @@
 #!/bin/sh
 
-gcloud dataproc jobs submit spark \
-    --cluster=${CLUSTER} \
-    --jar=gs://${BUCKET_NAME}/scala/co-purchase-analysis_2.12-1.0.jar \
-    --region=${REGION} \
-    --properties spark.hadoop.fs.gs.impl=com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem \
-    -- gs://${BUCKET_NAME}/input/ gs://${BUCKET_NAME}/output/
+set -e
+
+INPUT_PATH="gs://${BUCKET_NAME}/input/"
+OUTPUT_PATH="gs://${BUCKET_NAME}/output"
+
+if [ -z "${BUCKET_NAME}" ] || [ -z "${CLUSTER}" ] || [ -z "${REGION}" ]; then
+    echo "Error: BUCKET_NAME, CLUSTER, and REGION environment variables must be set."
+    exit 1
+fi
+if gsutil ls "${OUTPUT_PATH}" > /dev/null 2>&1; then
+    echo ">>>> Output folder already exists. Renaming..."
+    UUID=$(cat /proc/sys/kernel/random/uuid)
+    NEW_OUTPUT_PATH="${OUTPUT_PATH}-${UUID}"
+
+    echo ">>>> Copying existing output folder to ${NEW_OUTPUT_PATH}..."
+    if gsutil -m cp -r "${OUTPUT_PATH}/" "${NEW_OUTPUT_PATH}/"; then
+        echo ">>>> Deleting original output folder..."
+        if gsutil -m rm -r "${OUTPUT_PATH}"; then
+            echo ">>>> Original output folder successfully renamed to ${NEW_OUTPUT_PATH}"
+        else
+            echo "Error: Failed to delete the original output folder after copying."
+            exit 1
+        fi
+    else
+        echo "Error: Failed to copy the output folder to ${NEW_OUTPUT_PATH}."
+        exit 1
+    fi
+fi
+
+gcloud dataproc jobs submit spark \
+    --cluster="${CLUSTER}" \
+    --jar="gs://${BUCKET_NAME}/scala/co-purchase-analysis_2.12-1.0.jar" \
+    --region="${REGION}" \
+    --properties="spark.hadoop.fs.gs.impl=com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem" \
+    -- "${INPUT_PATH}" "${OUTPUT_PATH}"
 
diff --git a/scripts/06-cleanup.sh b/scripts/06-cleanup.sh
deleted file mode 100755
index 50c10f7..0000000
--- a/scripts/06-cleanup.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/sh
-
-gcloud storage rm -r gs://${BUCKET_NAME}
-gcloud dataproc clusters delete ${CLUSTER} --region=${REGION}
-gcloud iam service-accounts delete ${SERVICE_ACCOUNT}@${PROJECT}.iam.gserviceaccount.com
diff --git a/scripts/06-dataproc-update-cluster.sh b/scripts/06-dataproc-update-cluster.sh
new file mode 100755
index 0000000..cb098ef
--- /dev/null
+++ b/scripts/06-dataproc-update-cluster.sh
@@ -0,0 +1,40 @@
+#!/bin/sh
+
+set -eu
+
+if [ "$#" -ne 1 ]; then
+    echo "Usage: 'sh ${PWD}/$0 <num-workers>'"
+    exit 1
+fi
+
+NUM_WORKERS="$1"
+
+if [ "$NUM_WORKERS" -lt 1 ] || [ "$NUM_WORKERS" -gt 4 ]; then
+    echo "<num-workers> must be 1, 2, 3, or 4"
+    exit 1
+fi
+
+
+# Handle single worker case
+if [ "$NUM_WORKERS" -eq 1 ]; then
+    if gcloud dataproc clusters describe "${CLUSTER}" --region="${REGION}" > /dev/null 2>&1; then
+        echo ">>>> Cluster exists. Destroying it..."
+        gcloud dataproc clusters delete "${CLUSTER}" --region="${REGION}" --quiet
+    fi
+
+    echo ">>>> Creating a new cluster with 1 worker..."
+    eval "scripts/04-dataproc-create-cluster.sh 1"
+else
+    if ! gcloud dataproc clusters update "${CLUSTER}" \
+        --project="${PROJECT}" --region="${REGION}" \
+        --num-workers="${NUM_WORKERS}" > /dev/null 2>&1; then
+        echo ">>>> Cluster is a single node. Destroying it to update the number of workers..."
+        gcloud dataproc clusters delete "${CLUSTER}" --region="${REGION}" --quiet
+
+        echo ">>>> Creating a new cluster with ${NUM_WORKERS} workers..."
+        eval "scripts/04-dataproc-create-cluster.sh ${NUM_WORKERS}"
+    else
+        echo ">>>> Successfully updated the cluster to ${NUM_WORKERS} workers."
+    fi
+fi
+
diff --git a/scripts/07-cleanup.sh b/scripts/07-cleanup.sh
new file mode 100755
index 0000000..50c10f7
--- /dev/null
+++ b/scripts/07-cleanup.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+
+gcloud storage rm -r gs://${BUCKET_NAME}
+gcloud dataproc clusters delete ${CLUSTER} --region=${REGION}
+gcloud iam service-accounts delete ${SERVICE_ACCOUNT}@${PROJECT}.iam.gserviceaccount.com
-- 
cgit v1.2.3-18-g5258