summary | refs | log | tree | commit | diff
path: root/scripts/05-dataproc-submit.sh
diff options
context:
space:
mode:
author Santo Cariotti <santo@dcariotti.me> 2025-01-13 19:08:36 +0100
committer Santo Cariotti <santo@dcariotti.me> 2025-01-13 19:09:09 +0100
commit 80930bb9d945b2ffee0fdda78ebd8cbe1caa4dc2 (patch)
tree 589ff2610bab1c2d2facabc103974179d0c289e9 /scripts/05-dataproc-submit.sh
parent e6d14f7b388f4f866234d444668d8801cbf9661c (diff)
Partitions number as argument
Diffstat (limited to 'scripts/05-dataproc-submit.sh')
-rwxr-xr-x scripts/05-dataproc-submit.sh 8
1 files changed, 7 insertions, 1 deletions
diff --git a/scripts/05-dataproc-submit.sh b/scripts/05-dataproc-submit.sh
index b70e138..b2c9e42 100755
--- a/scripts/05-dataproc-submit.sh
+++ b/scripts/05-dataproc-submit.sh
@@ -2,6 +2,12 @@
set -e
+if [ "$#" -ne 1 ]; then
+ echo "Usage: 'sh ${PWD}/$0 <num-partitions>'"
+ exit 1
+fi
+
+NUM_PARTITIONS="$1"
INPUT_PATH="gs://${BUCKET_NAME}/input/"
OUTPUT_PATH="gs://${BUCKET_NAME}/output"
@@ -35,4 +41,4 @@ gcloud dataproc jobs submit spark \
--jar="gs://${BUCKET_NAME}/scala/co-purchase-analysis_2.12-1.0.jar" \
--region="${REGION}" \
--properties="spark.hadoop.fs.gs.impl=com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem" \
- -- "${INPUT_PATH}" "${OUTPUT_PATH}"
+ -- "${INPUT_PATH}" "${OUTPUT_PATH}" "${NUM_PARTITIONS}"