diff options
author | Santo Cariotti <santo@dcariotti.me> | 2024-12-27 15:24:55 +0100 |
---|---|---|
committer | Santo Cariotti <santo@dcariotti.me> | 2024-12-27 15:24:55 +0100 |
commit | 2b82085fceb37e3a895ad8443f0de9eae6fc0435 (patch) | |
tree | c9e4fff03c6ba8ce89b98d8f5f60726dad3f9e88 | |
parent | 0a58d50a1480f6969f4f032c431446d3999b4437 (diff) |
Config Hadoop fs for Google and remove checks
-rw-r--r-- | co-purchase-analysis/src/main/scala/Main.scala | 16 |
1 file changed, 8 insertions, 8 deletions
diff --git a/co-purchase-analysis/src/main/scala/Main.scala b/co-purchase-analysis/src/main/scala/Main.scala index ad79e9f..ec113b3 100644 --- a/co-purchase-analysis/src/main/scala/Main.scala +++ b/co-purchase-analysis/src/main/scala/Main.scala @@ -60,10 +60,6 @@ object CoPurchaseAnalysis { def checkArguments(args: Array[String]): Option[String] = { if (args.length != 2) { Some("You must define input file and output folder.") - } else if (!Files.exists(Paths.get(args(0)))) { - Some(s"Input file `${args(0)}` does not exist.") - } else if (Files.exists(Paths.get(args(1)))) { - Some(s"Output folder `${args(1)}` already exists.") } else { None } @@ -82,6 +78,11 @@ object CoPurchaseAnalysis { SparkSession.builder .appName(appName) .config("spark.master", master) + .config("spark.hadoop.google.cloud.auth.service.account.enable", "true") + .config( + "spark.hadoop.google.cloud.auth.service.account.json.keyfile", + System.getenv("GOOGLE_APPLICATION_CREDENTIALS") + ) .getOrCreate() } @@ -108,7 +109,6 @@ object CoPurchaseAnalysis { * combinations */ def generateProductPairs(products: List[Int]): List[ProductPair] = { - val sortedProducts = products.sorted for { i <- products.indices.toList j <- (i + 1) until products.length @@ -133,9 +133,9 @@ object CoPurchaseAnalysis { /** Processes the order data to generate co-purchase statistics. * - * The processing pipeline includes: - * 1. Grouping orders by orderId 2. Generating product pairs for each order - * 3. Counting occurrences of each product pair + * The processing pipeline includes: (1) Grouping orders by orderId, (2) + * Generating product pairs for each order, (3) Counting occurrences of each + * product pair * * @param data * RDD containing OrderProduct instances |