Spark入门(四)Idea远程提交项目到spark集群
程序员文章站
2022-03-06 08:02:02
...
一、依赖包配置
引入Scala与Spark的相关依赖包。注意:Spark构件名下划线后面的版本号(例如 spark-core_2.11 中的 2.11)必须与Scala版本号的前两位保持一致,即2.11。
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven build descriptor for the spark-test example project. -->
<modelVersion>4.0.0</modelVersion>
<!-- Project coordinates -->
<groupId>com.mk</groupId>
<artifactId>spark-test</artifactId>
<version>1.0</version>
<name>spark-test</name>
<url>http://spark.mk.com</url>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<!-- Compile sources as, and emit bytecode for, Java 8 -->
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<!-- Library versions referenced by the dependencies below -->
<scala.version>2.11.1</scala.version>
<spark.version>2.4.4</spark.version>
<!-- NOTE(review): hadoop.version is not referenced anywhere in this POM as shown; confirm whether it is still needed -->
<hadoop.version>2.6.0</hadoop.version>
</properties>
<dependencies>
<!-- Scala standard library; its version comes from the scala.version property -->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>
<!-- Spark dependencies; the _2.11 suffix of the artifactId must match the first two components of scala.version -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- JUnit, available on the test classpath only -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<!-- Pins the versions of the Maven plugins listed below -->
<pluginManagement>
<plugins>
<plugin>
<artifactId>maven-clean-plugin</artifactId>
<version>3.1.0</version>
</plugin>
<plugin>
<artifactId>maven-resources-plugin</artifactId>
<version>3.0.2</version>
</plugin>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.0</version>
</plugin>
<plugin>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.22.1</version>
</plugin>
<plugin>
<artifactId>maven-jar-plugin</artifactId>
<version>3.0.2</version>
</plugin>
</plugins>
</pluginManagement>
</build>
</project>
二、PI例子
用Java重新编写Spark自带的Scala版PI(圆周率估算)示例:
package com.mk;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import java.util.ArrayList;
import java.util.List;
/**
 * Estimates PI with a Monte Carlo simulation that runs as a Spark job.
 *
 * <p>When started on Windows, the driver is configured to submit the job to the
 * standalone cluster named in {@link #main(String[])}. On any other operating system this
 * class sets no master URL, so one has to be provided from outside this class.
 */
public class App {

    /**
     * Builds the Spark session, samples random points in the square [-1, 1) x [-1, 1),
     * counts how many fall inside the unit circle and prints the resulting PI estimate.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf();
        if (System.getProperty("os.name").toLowerCase().contains("win")) {
            // For a purely local run, replace the settings in this branch with
            // sparkConf.setMaster("local[2]").
            sparkConf.setMaster("spark://hadoop01:7077,hadoop02:7077,hadoop03:7077");
            // IP of this (driver) machine; it and the Spark cluster must be able to reach
            // each other, e.g. by being on the same LAN.
            sparkConf.set("spark.driver.host", "192.168.10.126");
            // Path of the jar produced by the project build; it is shipped to the cluster.
            sparkConf.setJars(new String[] {".\\out\\artifacts\\spark_test\\spark-test.jar"});
        }

        SparkSession session = SparkSession.builder().appName("Pi").config(sparkConf).getOrCreate();
        try {
            int slices = 2;
            // Computed in long arithmetic and clamped so the sample count always fits in an int.
            int n = (int) Math.min(100_000L * slices, Integer.MAX_VALUE);
            JavaSparkContext sparkContext = new JavaSparkContext(session.sparkContext());

            // The element values are irrelevant; the list only fixes the number of samples.
            List<Integer> list = new ArrayList<>(n);
            for (int i = 0; i < n; i++) {
                list.add(i + 1);
            }

            int count = sparkContext.parallelize(list, slices)
                    .map(v -> {
                        double x = Math.random() * 2 - 1;
                        double y = Math.random() * 2 - 1;
                        // 1 when the random point lies inside the unit circle, otherwise 0.
                        return (x * x + y * y < 1) ? 1 : 0;
                    })
                    .reduce((Integer a, Integer b) -> a + b);

            // Hits / samples approximates PI / 4 (circle area over square area).
            System.out.println("PI:" + 4.0 * count / n);
        } finally {
            // Stop the session even when the job fails so the driver can shut down.
            session.stop();
        }
    }
}
三、直接在idea本地运行
输出PI
四、局限性
注意:项目机器的本地ip,必须与spark集群能够相互访问,如:同一个局域网。
如果本机与集群不在同一个网络,提交会失败,并且任务会一直重试、无法退出。