# Learn practical skills, build real-world projects, and advance your career
# Created 3 years ago
# Align interpreter and runtime paths for PySpark on this CDH cluster.
# The driver (local machine) and the executors (EC2 worker nodes) must run
# the same Python version — python3 from the Anaconda parcel here.
import os
import sys

# Cluster-specific locations: Anaconda parcel, Cloudera JDK 8 JRE, Spark2 parcel.
_SPARK_ENV = {
    "PYSPARK_PYTHON": "/opt/cloudera/parcels/Anaconda/bin/python",
    "JAVA_HOME": "/usr/java/jdk1.8.0_232-cloudera/jre",
    "SPARK_HOME": "/opt/cloudera/parcels/SPARK2-2.3.0.cloudera2-1.cdh5.13.3.p0.316101/lib/spark2/",
}
for _name, _value in _SPARK_ENV.items():
    os.environ[_name] = _value
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"

# Make Spark's bundled py4j and pyspark archives importable ahead of anything
# else on sys.path (pyspark.zip ends up at index 0, py4j at index 1).
for _archive in ("py4j-0.10.6-src.zip", "pyspark.zip"):
    sys.path.insert(0, os.environ["PYLIB"] + "/" + _archive)
# Create (or reuse) the Spark session — the entry point for DataFrame/MLlib work.
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Mllib_Overview').getOrCreate()
# Read the raw auto-MPG dataset from CSV (resolved relative to the working directory).
# header=True   -> the first row supplies the column names.
# inferSchema=False -> every column is loaded as string; no type inference is done.
#   NOTE(review): the original comment claimed the schema was inferred, which
#   contradicted this flag — presumably the strings are cast downstream since
#   this is the "Raw" file; confirm against later cleaning steps.
df1 = spark.read.csv('auto-miles-per-gallon-Raw.csv', header=True, inferSchema=False)
# Print the first 20 rows as a quick sanity check of the load.
df1.show()