author     Santo Cariotti <santo@dcariotti.me>    2024-12-27 22:22:35 +0100
committer  Santo Cariotti <santo@dcariotti.me>    2024-12-27 22:22:35 +0100
commit     299f5ab9c38834fc58b2f2a434c1495ac3d1c554 (patch)
tree       319dbf5aa7a9d507b985760e860934484002ca4b
parent     fc35b02c84c94d5965a8317df6312c9a3ececc68 (diff)
Add scripts
-rwxr-xr-x  scripts/00-create-service-account.sh        16
-rwxr-xr-x  scripts/01-create-bucket.sh                  9
-rwxr-xr-x  scripts/02-dataproc-create-cluster.sh       10
-rwxr-xr-x  scripts/03-update-network-for-dataproc.sh    6
-rwxr-xr-x  scripts/04-dataproc-copy-jar.sh              8
-rwxr-xr-x  scripts/05-dataproc-submit.sh                9
-rwxr-xr-x  scripts/06-cleanup.sh                        5
7 files changed, 63 insertions, 0 deletions
diff --git a/scripts/00-create-service-account.sh b/scripts/00-create-service-account.sh
new file mode 100755
index 0000000..984e55b
--- /dev/null
+++ b/scripts/00-create-service-account.sh
@@ -0,0 +1,16 @@
+#!/bin/sh
+
+gcloud iam service-accounts create ${SERVICE_ACCOUNT} \
+ --description="Spark access account to Google Cloud Buckets" \
+ --display-name="Spark to Bucket"
+
+gcloud projects add-iam-policy-binding ${PROJECT} \
+ --member="serviceAccount:${SERVICE_ACCOUNT}@${PROJECT}.iam.gserviceaccount.com" \
+ --role="roles/storage.objectAdmin"
+
+gcloud projects add-iam-policy-binding ${PROJECT} \
+ --member="serviceAccount:${SERVICE_ACCOUNT}@${PROJECT}.iam.gserviceaccount.com" \
+ --role="roles/dataproc.worker"
+
+gcloud iam service-accounts keys create ./google-service-account-key.json \
+ --iam-account=${SERVICE_ACCOUNT}@${PROJECT}.iam.gserviceaccount.com
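The script above only works if SERVICE_ACCOUNT and PROJECT are already exported in the calling shell; a minimal invocation sketch with placeholder values (not taken from this commit):

    export PROJECT=my-gcp-project            # placeholder project id
    export SERVICE_ACCOUNT=spark-bucket-sa   # placeholder account name
    ./scripts/00-create-service-account.sh   # writes google-service-account-key.json to the current directory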
diff --git a/scripts/01-create-bucket.sh b/scripts/01-create-bucket.sh
new file mode 100755
index 0000000..fb853c9
--- /dev/null
+++ b/scripts/01-create-bucket.sh
@@ -0,0 +1,9 @@
+#!/bin/sh
+
+printf "Enter 'order_products.csv' path: "; read -r path
+
+gcloud storage buckets create gs://$BUCKET_NAME --location=eu
+gcloud storage buckets add-iam-policy-binding gs://${BUCKET_NAME} \
+ --member="serviceAccount:${SERVICE_ACCOUNT}@${PROJECT}.iam.gserviceaccount.com" \
+ --role="roles/storage.objectCreator"
+gcloud storage cp "$path" gs://$BUCKET_NAME/input/
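In addition to SERVICE_ACCOUNT and PROJECT, this script expects BUCKET_NAME and prompts for the local path of order_products.csv; usage sketch with a placeholder bucket name:

    export BUCKET_NAME=co-purchase-analysis-data   # placeholder bucket name
    ./scripts/01-create-bucket.sh                  # prompts for the CSV path, then uploads it to gs://$BUCKET_NAME/input/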
diff --git a/scripts/02-dataproc-create-cluster.sh b/scripts/02-dataproc-create-cluster.sh
new file mode 100755
index 0000000..10c7d0c
--- /dev/null
+++ b/scripts/02-dataproc-create-cluster.sh
@@ -0,0 +1,10 @@
+#!/bin/sh
+
+gcloud dataproc clusters create ${CLUSTER} \
+ --project=${PROJECT} \
+ --region=${REGION} \
+ --service-account=${SERVICE_ACCOUNT}@${PROJECT}.iam.gserviceaccount.com \
+ --master-boot-disk-size 240 \
+ --worker-boot-disk-size 240 \
+ --num-workers 1 \
+ --single-node
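Cluster creation reads CLUSTER, PROJECT, REGION, and SERVICE_ACCOUNT from the environment; for example, with placeholder values:

    export CLUSTER=copurchase-cluster   # placeholder cluster name
    export REGION=europe-west1          # placeholder region
    ./scripts/02-dataproc-create-cluster.sh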
diff --git a/scripts/03-update-network-for-dataproc.sh b/scripts/03-update-network-for-dataproc.sh
new file mode 100755
index 0000000..2d27945
--- /dev/null
+++ b/scripts/03-update-network-for-dataproc.sh
@@ -0,0 +1,6 @@
+#!/bin/sh
+
+gcloud compute networks subnets update default \
+ --region $REGION \
+ --enable-private-ip-google-access
+
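This step only needs REGION; enabling Private Google Access lets cluster VMs without external IPs reach Cloud Storage. With REGION still exported from the previous step:

    ./scripts/03-update-network-for-dataproc.sh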
diff --git a/scripts/04-dataproc-copy-jar.sh b/scripts/04-dataproc-copy-jar.sh
new file mode 100755
index 0000000..de8795f
--- /dev/null
+++ b/scripts/04-dataproc-copy-jar.sh
@@ -0,0 +1,8 @@
+#!/bin/sh
+
+cd ./co-purchase-analysis
+SCALA_VERSION=2.12.10 sbt clean package
+cd -
+
+gcloud storage cp co-purchase-analysis/target/scala-2.12/co-purchase-analysis_2.12-1.0.jar \
+ gs://${BUCKET_NAME}/scala/co-purchase-analysis_2.12-1.0.jar
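Building the jar requires a local sbt installation; the script packages the Scala project and uploads the artifact to the bucket. With BUCKET_NAME still exported:

    ./scripts/04-dataproc-copy-jar.sh   # runs 'sbt clean package' in ./co-purchase-analysis, then copies the jar to gs://$BUCKET_NAME/scala/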
diff --git a/scripts/05-dataproc-submit.sh b/scripts/05-dataproc-submit.sh
new file mode 100755
index 0000000..dfc5498
--- /dev/null
+++ b/scripts/05-dataproc-submit.sh
@@ -0,0 +1,9 @@
+#!/bin/sh
+
+gcloud dataproc jobs submit spark \
+ --cluster=${CLUSTER} \
+ --jar=gs://${BUCKET_NAME}/scala/co-purchase-analysis_2.12-1.0.jar \
+ --region=${REGION} \
+ --properties spark.hadoop.fs.gs.impl=com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem \
+ -- gs://${BUCKET_NAME}/input/ gs://${BUCKET_NAME}/output/
+
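Submission reuses CLUSTER, REGION, and BUCKET_NAME; the two arguments after the final -- are passed to the Spark application as its input and output locations:

    ./scripts/05-dataproc-submit.sh   # reads gs://$BUCKET_NAME/input/, writes gs://$BUCKET_NAME/output/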
diff --git a/scripts/06-cleanup.sh b/scripts/06-cleanup.sh
new file mode 100755
index 0000000..50c10f7
--- /dev/null
+++ b/scripts/06-cleanup.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+
+gcloud storage rm -r gs://${BUCKET_NAME}
+gcloud dataproc clusters delete ${CLUSTER} --region=${REGION}
+gcloud iam service-accounts delete ${SERVICE_ACCOUNT}@${PROJECT}.iam.gserviceaccount.com
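Cleanup removes everything the earlier scripts created (bucket, Dataproc cluster, service account) using the same environment variables; gcloud normally asks for confirmation before the destructive deletes:

    ./scripts/06-cleanup.sh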