Example: Locating and Adding JARs to Spark 2 Configuration
This example shows how to discover the location of the JAR files installed with Spark 2 and add them to the Spark 2 configuration.
# # Using Avro data
#
# This example shows how to use a JAR file on the local filesystem on
# Spark on YARN.
from __future__ import print_function
import os
from pyspark.sql import SparkSession
# Add the data file to HDFS for consumption by the Spark executors.
!hdfs dfs -put resources/users.avro /tmp
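# The "!" shell escape above works in the interactive workbench; in a
# plain Python script, one way to run the same upload is subprocess
# (a sketch, not part of the original example; "-f" overwrites any
# existing copy so the step can be re-run safely).
import subprocess
subprocess.check_call(["hdfs", "dfs", "-put", "-f", "resources/users.avro", "/tmp"])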
# Find the example JARs provided by the Spark parcel. This parcel
# is available on both the driver, which runs in Cloudera Machine Learning, and the
# executors, which run on YARN.
exampleDir = os.path.join(os.environ["SPARK_HOME"], "examples/jars")
exampleJars = [os.path.join(exampleDir, x) for x in os.listdir(exampleDir)]
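# Quick sanity check (not in the original example): confirm the parcel
# actually shipped example JARs before wiring them into the session.
assert exampleJars, "no JARs found under " + exampleDir
for jar in exampleJars:
    print(jar)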
# Add the Spark JARs to the Spark configuration to make them available for use.
spark = SparkSession\
    .builder\
    .config("spark.jars", ",".join(exampleJars))\
    .appName("AvroKeyInputFormat")\
    .getOrCreate()
sc = spark.sparkContext
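# Optional verification (not in the original example): the JARs passed
# to the builder should now show up in the session's configuration.
print(sc.getConf().get("spark.jars"))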
# Read the schema.
schema = open("resources/user.avsc").read()
conf = {"avro.schema.input.key": schema}
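# Avro schemas are JSON documents, so a json.loads round trip (a sanity
# check added here, not in the original example) catches a malformed
# .avsc file before the Hadoop job is submitted.
import json
json.loads(schema)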
avro_rdd = sc.newAPIHadoopFile(
    "/tmp/users.avro",  # This is an HDFS path, not a local one!
    "org.apache.avro.mapreduce.AvroKeyInputFormat",
    "org.apache.avro.mapred.AvroKey",
    "org.apache.hadoop.io.NullWritable",
    keyConverter="org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter",
    conf=conf)
output = avro_rdd.map(lambda x: x[0]).collect()
for k in output:
    print(k)
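# If a DataFrame is more convenient than raw records, the RDD can be
# converted as sketched below. This assumes the key converter yields
# each record as a Python dict, as the printed output above suggests;
# it is an illustration, not part of the original example.
from pyspark.sql import Row
users_df = spark.createDataFrame(avro_rdd.map(lambda x: Row(**x[0])))
users_df.show()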
spark.stop()