summary refs log tree commit diff
diff options
context:
space:
mode:
author Santo Cariotti <santo@dcariotti.me> 2024-12-27 15:24:55 +0100
committer Santo Cariotti <santo@dcariotti.me> 2024-12-27 15:24:55 +0100
commit 2b82085fceb37e3a895ad8443f0de9eae6fc0435 (patch)
tree c9e4fff03c6ba8ce89b98d8f5f60726dad3f9e88
parent 0a58d50a1480f6969f4f032c431446d3999b4437 (diff)
Config Hadoop fs for Google and remove checks
-rw-r--r-- co-purchase-analysis/src/main/scala/Main.scala 16
1 files changed, 8 insertions, 8 deletions
diff --git a/co-purchase-analysis/src/main/scala/Main.scala b/co-purchase-analysis/src/main/scala/Main.scala
index ad79e9f..ec113b3 100644
--- a/co-purchase-analysis/src/main/scala/Main.scala
+++ b/co-purchase-analysis/src/main/scala/Main.scala
@@ -60,10 +60,6 @@ object CoPurchaseAnalysis {
def checkArguments(args: Array[String]): Option[String] = {
if (args.length != 2) {
Some("You must define input file and output folder.")
- } else if (!Files.exists(Paths.get(args(0)))) {
- Some(s"Input file `${args(0)}` does not exist.")
- } else if (Files.exists(Paths.get(args(1)))) {
- Some(s"Output folder `${args(1)}` already exists.")
} else {
None
}
@@ -82,6 +78,11 @@ object CoPurchaseAnalysis {
SparkSession.builder
.appName(appName)
.config("spark.master", master)
+ .config("spark.hadoop.google.cloud.auth.service.account.enable", "true")
+ .config(
+ "spark.hadoop.google.cloud.auth.service.account.json.keyfile",
+ System.getenv("GOOGLE_APPLICATION_CREDENTIALS")
+ )
.getOrCreate()
}
@@ -108,7 +109,6 @@ object CoPurchaseAnalysis {
* combinations
*/
def generateProductPairs(products: List[Int]): List[ProductPair] = {
- val sortedProducts = products.sorted
for {
i <- products.indices.toList
j <- (i + 1) until products.length
@@ -133,9 +133,9 @@ object CoPurchaseAnalysis {
/** Processes the order data to generate co-purchase statistics.
*
- * The processing pipeline includes:
- * 1. Grouping orders by orderId 2. Generating product pairs for each order
- * 3. Counting occurrences of each product pair
+ * The processing pipeline includes: (1) Grouping orders by orderId, (2)
+ * Generating product pairs for each order, (3) Counting occurrences of each
+ * product pair
*
* @param data
* RDD containing OrderProduct instances