diff options
author | Santo Cariotti <santo@dcariotti.me> | 2024-12-27 22:22:35 +0100 |
---|---|---|
committer | Santo Cariotti <santo@dcariotti.me> | 2024-12-27 22:22:35 +0100 |
commit | 299f5ab9c38834fc58b2f2a434c1495ac3d1c554 (patch) | |
tree | 319dbf5aa7a9d507b985760e860934484002ca4b | |
parent | fc35b02c84c94d5965a8317df6312c9a3ececc68 (diff) |
Add scripts
-rwxr-xr-x | scripts/00-create-service-account.sh | 16 | ||||
-rwxr-xr-x | scripts/01-create-bucket.sh | 9 | ||||
-rwxr-xr-x | scripts/02-dataproc-create-cluster.sh | 10 | ||||
-rwxr-xr-x | scripts/03-update-network-for-dataproc.sh | 6 | ||||
-rwxr-xr-x | scripts/04-dataproc-copy-jar.sh | 8 | ||||
-rwxr-xr-x | scripts/05-dataproc-submit.sh | 9 | ||||
-rwxr-xr-x | scripts/06-cleanup.sh | 5 |
7 files changed, 63 insertions, 0 deletions
diff --git a/scripts/00-create-service-account.sh b/scripts/00-create-service-account.sh
new file mode 100755
index 0000000..984e55b
--- /dev/null
+++ b/scripts/00-create-service-account.sh
@@ -0,0 +1,30 @@
+#!/bin/sh
+#
+# Create the Spark service account, grant it the storage.objectAdmin and
+# dataproc.worker roles on the project, and write a JSON key for it to
+# ./google-service-account-key.json.
+#
+# Required environment variables: SERVICE_ACCOUNT, PROJECT.
+
+set -eu
+
+: "${SERVICE_ACCOUNT:?SERVICE_ACCOUNT must be set}"
+: "${PROJECT:?PROJECT must be set}"
+
+sa_email="${SERVICE_ACCOUNT}@${PROJECT}.iam.gserviceaccount.com"
+
+gcloud iam service-accounts create "${SERVICE_ACCOUNT}" \
+    --description="Spark access account to Google Cloud Buckets" \
+    --display-name="Spark to Bucket"
+
+gcloud projects add-iam-policy-binding "${PROJECT}" \
+    --member="serviceAccount:${sa_email}" \
+    --role="roles/storage.objectAdmin"
+
+gcloud projects add-iam-policy-binding "${PROJECT}" \
+    --member="serviceAccount:${sa_email}" \
+    --role="roles/dataproc.worker"
+
+# The key file is a credential - keep it out of version control.
+gcloud iam service-accounts keys create ./google-service-account-key.json \
+    --iam-account="${sa_email}"
diff --git a/scripts/01-create-bucket.sh b/scripts/01-create-bucket.sh
new file mode 100755
index 0000000..fb853c9
--- /dev/null
+++ b/scripts/01-create-bucket.sh
@@ -0,0 +1,25 @@
+#!/bin/sh
+#
+# Create the storage bucket, allow the service account to create objects in
+# it, and upload the order_products.csv file to gs://BUCKET_NAME/input/.
+#
+# Required environment variables: BUCKET_NAME, SERVICE_ACCOUNT, PROJECT.
+
+set -eu
+
+: "${BUCKET_NAME:?BUCKET_NAME must be set}"
+: "${SERVICE_ACCOUNT:?SERVICE_ACCOUNT must be set}"
+: "${PROJECT:?PROJECT must be set}"
+
+# POSIX read has no -p option and dash rejects it, so print the prompt with
+# printf. It goes to stderr like the bash prompt did, and -r keeps any
+# backslashes in the path literal.
+printf '%s' "Enter 'order_products.csv' path: " >&2
+read -r path
+[ -n "${path}" ] || { printf '%s\n' "No path given, aborting." >&2; exit 1; }
+
+gcloud storage buckets create "gs://${BUCKET_NAME}" --location=eu
+gcloud storage buckets add-iam-policy-binding "gs://${BUCKET_NAME}" \
+    --member="serviceAccount:${SERVICE_ACCOUNT}@${PROJECT}.iam.gserviceaccount.com" \
+    --role="roles/storage.objectCreator"
+gcloud storage cp "${path}" "gs://${BUCKET_NAME}/input/"
diff --git a/scripts/02-dataproc-create-cluster.sh b/scripts/02-dataproc-create-cluster.sh
new file mode 100755
index 0000000..10c7d0c
--- /dev/null
+++ b/scripts/02-dataproc-create-cluster.sh
@@ -0,0 +1,23 @@
+#!/bin/sh
+#
+# Create the Dataproc cluster, running as the service account.
+#
+# Required environment variables: CLUSTER, PROJECT, REGION, SERVICE_ACCOUNT.
+
+set -eu
+
+: "${CLUSTER:?CLUSTER must be set}"
+: "${PROJECT:?PROJECT must be set}"
+: "${REGION:?REGION must be set}"
+: "${SERVICE_ACCOUNT:?SERVICE_ACCOUNT must be set}"
+
+# NOTE review: --single-node together with --num-workers looks
+# contradictory - confirm that gcloud accepts both.
+gcloud dataproc clusters create "${CLUSTER}" \
+    --project="${PROJECT}" \
+    --region="${REGION}" \
+    --service-account="${SERVICE_ACCOUNT}@${PROJECT}.iam.gserviceaccount.com" \
+    --master-boot-disk-size 240 \
+    --worker-boot-disk-size 240 \
+    --num-workers 1 \
+    --single-node
diff --git a/scripts/03-update-network-for-dataproc.sh b/scripts/03-update-network-for-dataproc.sh
new file mode 100755
index 0000000..2d27945
--- /dev/null
+++ b/scripts/03-update-network-for-dataproc.sh
@@ -0,0 +1,13 @@
+#!/bin/sh
+#
+# Enable private Google access on the subnet named default in REGION.
+#
+# Required environment variables: REGION.
+
+set -eu
+
+: "${REGION:?REGION must be set}"
+
+gcloud compute networks subnets update default \
+    --region "${REGION}" \
+    --enable-private-ip-google-access
diff --git a/scripts/04-dataproc-copy-jar.sh b/scripts/04-dataproc-copy-jar.sh
new file mode 100755
index 0000000..de8795f
--- /dev/null
+++ b/scripts/04-dataproc-copy-jar.sh
@@ -0,0 +1,22 @@
+#!/bin/sh
+#
+# Build the co-purchase-analysis jar with sbt and upload it to
+# gs://BUCKET_NAME/scala/. Run it from the directory that contains
+# co-purchase-analysis/.
+#
+# Required environment variables: BUCKET_NAME.
+
+set -eu
+
+: "${BUCKET_NAME:?BUCKET_NAME must be set}"
+
+jar="co-purchase-analysis_2.12-1.0.jar"
+
+# set -e stops the script when cd or sbt fails, so a build in the wrong
+# directory or a stale jar is never uploaded.
+cd ./co-purchase-analysis
+SCALA_VERSION=2.12.10 sbt clean package
+cd -
+
+gcloud storage cp "co-purchase-analysis/target/scala-2.12/${jar}" \
+    "gs://${BUCKET_NAME}/scala/${jar}"
diff --git a/scripts/05-dataproc-submit.sh b/scripts/05-dataproc-submit.sh
new file mode 100755
index 0000000..dfc5498
--- /dev/null
+++ b/scripts/05-dataproc-submit.sh
@@ -0,0 +1,20 @@
+#!/bin/sh
+#
+# Submit the co-purchase-analysis jar as a Spark job on the Dataproc cluster,
+# passing gs://BUCKET_NAME/input/ and gs://BUCKET_NAME/output/ as its
+# arguments.
+#
+# Required environment variables: CLUSTER, BUCKET_NAME, REGION.
+
+set -eu
+
+: "${CLUSTER:?CLUSTER must be set}"
+: "${BUCKET_NAME:?BUCKET_NAME must be set}"
+: "${REGION:?REGION must be set}"
+
+gcloud dataproc jobs submit spark \
+    --cluster="${CLUSTER}" \
+    --jar="gs://${BUCKET_NAME}/scala/co-purchase-analysis_2.12-1.0.jar" \
+    --region="${REGION}" \
+    --properties spark.hadoop.fs.gs.impl=com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem \
+    -- "gs://${BUCKET_NAME}/input/" "gs://${BUCKET_NAME}/output/"
diff --git a/scripts/06-cleanup.sh b/scripts/06-cleanup.sh
new file mode 100755
index 0000000..50c10f7
--- /dev/null
+++ b/scripts/06-cleanup.sh
@@ -0,0 +1,26 @@
+#!/bin/sh
+#
+# Delete the bucket with its contents, the Dataproc cluster and the service
+# account.
+#
+# Required environment variables: BUCKET_NAME, CLUSTER, REGION,
+# SERVICE_ACCOUNT, PROJECT.
+
+set -u
+
+# An empty BUCKET_NAME would turn the recursive delete into gs:// alone, so
+# refuse to run when any variable is missing.
+: "${BUCKET_NAME:?BUCKET_NAME must be set}"
+: "${CLUSTER:?CLUSTER must be set}"
+: "${REGION:?REGION must be set}"
+: "${SERVICE_ACCOUNT:?SERVICE_ACCOUNT must be set}"
+: "${PROJECT:?PROJECT must be set}"
+
+# No set -e here: a failed deletion must not stop the remaining ones. The
+# exit status reports whether any of them failed.
+status=0
+gcloud storage rm -r "gs://${BUCKET_NAME}" || status=1
+gcloud dataproc clusters delete "${CLUSTER}" --region="${REGION}" || status=1
+gcloud iam service-accounts delete \
+    "${SERVICE_ACCOUNT}@${PROJECT}.iam.gserviceaccount.com" || status=1
+exit "${status}"