Example: Locating and Adding JARs to Spark 2 Configuration
This example shows how to discover the location of JAR files installed with Spark 2, and add them to the Spark 2 configuration.
#
# Using Avro data
#
# This example shows how to use a JAR file on the local filesystem on
# Spark on Yarn.
#
# Steps:
#   1. Upload the sample Avro data file to HDFS so the executors can read it.
#   2. Collect the example JARs shipped under $SPARK_HOME/examples/jars.
#   3. Start a SparkSession with those JARs listed in "spark.jars".
#   4. Read the Avro file through the Hadoop input format API and print
#      every record.

from __future__ import print_function

import os
import os.path
import subprocess
import sys
from functools import reduce

from pyspark.files import SparkFiles
from pyspark.sql import SparkSession

# Add the data file to HDFS for consumption by the Spark executors.
# subprocess is used instead of the IPython-only "!hdfs ..." shell escape so
# the script is also valid plain Python. As with the shell escape, the exit
# status is deliberately ignored, so a failed upload (for example, because the
# file is already present in HDFS from an earlier run) does not abort the
# example.
subprocess.call(["hdfs", "dfs", "-put", "resources/users.avro", "/tmp"])

# Find the example JARs provided by the Spark parcel. This parcel
# is available on both the driver, which runs in Cloudera Machine Learning,
# and the executors, which run on Yarn.
exampleDir = os.path.join(os.environ["SPARK_HOME"], "examples/jars")
# Keep only *.jar entries and sort them so the resulting "spark.jars" value
# does not depend on the arbitrary order returned by os.listdir().
exampleJars = [
    os.path.join(exampleDir, x)
    for x in sorted(os.listdir(exampleDir))
    if x.endswith(".jar")
]

# Add the Spark JARs to the Spark configuration to make them available for use.
# "spark.jars" takes a comma-separated list of paths.
spark = (
    SparkSession.builder
    .config("spark.jars", ",".join(exampleJars))
    .appName("AvroKeyInputFormat")
    .getOrCreate()
)

try:
    sc = spark.sparkContext

    # Read the schema. The context manager closes the file handle promptly
    # instead of leaving that to the garbage collector.
    with open("resources/user.avsc") as schema_file:
        schema = schema_file.read()
    conf = {"avro.schema.input.key": schema}

    avro_rdd = sc.newAPIHadoopFile(
        "/tmp/users.avro",  # This is an HDFS path!
        "org.apache.avro.mapreduce.AvroKeyInputFormat",
        "org.apache.avro.mapred.AvroKey",
        "org.apache.hadoop.io.NullWritable",
        keyConverter="org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter",
        conf=conf,
    )

    # Each element is a (key, value) pair; only the key (x[0]) is kept
    # and printed.
    output = avro_rdd.map(lambda x: x[0]).collect()
    for k in output:
        print(k)
finally:
    # Stop the session even if reading the schema or running the job fails.
    spark.stop()