import os
import sys

# Point PySpark at the Python interpreter and the cluster's Java and Spark installs.
# These paths are specific to a Cloudera CDH parcel layout on a Linux gateway node.
os.environ["PYSPARK_PYTHON"] = "/opt/cloudera/parcels/Anaconda/bin/python"
os.environ["JAVA_HOME"] = "/usr/java/jdk1.8.0_232-cloudera/"
os.environ["SPARK_HOME"] = "/opt/cloudera/parcels/SPARK2-2.3.0.cloudera2-1.cdh5.13.3.p0.316101/lib/spark2/"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"

# Make the py4j and pyspark packages bundled with that Spark install importable.
sys.path.insert(0, os.environ["PYLIB"] + "/py4j-0.10.9-src.zip")
sys.path.insert(0, os.environ["PYLIB"] + "/pyspark.zip")
# Install PySpark from PyPI (restart the kernel afterwards, as the note below suggests):
# pip install pyspark
Collecting pyspark
  Downloading pyspark-3.1.2.tar.gz (212.4 MB)
Collecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py): started
  Building wheel for pyspark (setup.py): finished with status 'done'
  Created wheel for pyspark: filename=pyspark-3.1.2-py2.py3-none-any.whl size=212880773 sha256=28c93708f13256711d24ac25fd3be44516792156567421d040bc3cc6290071f2
  Stored in directory: c:\users\aruna\appdata\local\pip\cache\wheels\df\88\9e\58ef1f74892fef590330ca0830b5b6d995ba29b44f977b3926
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.2
Note: you may need to restart the kernel to use updated packages.
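After restarting the kernel, the install reported above (pyspark 3.1.2 with py4j 0.10.9) can be confirmed directly from Python; if the versions print, the packages are importable:

import pyspark
import py4j

# Expect 3.1.2 and 0.10.9, matching the pip output above.
print(pyspark.__version__)
print(py4j.__version__)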
from pyspark.sql import SparkSession

# Create (or reuse) a SparkSession running in local mode.
spark = SparkSession.builder.appName('demo').master("local").getOrCreate()
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-15-371cbd2bc282> in <module>
----> 1 spark = SparkSession.builder.appName('demo').master("local").getOrCreate()

~\anaconda3\lib\site-packages\pyspark\sql\session.py in getOrCreate(self)
    226                     sparkConf.set(key, value)
    227                 # This SparkContext may be an existing one.
--> 228                 sc = SparkContext.getOrCreate(sparkConf)
    229                 # Do not update `SparkConf` for existing `SparkContext`, as it's shared
    230                 # by all sessions.

~\anaconda3\lib\site-packages\pyspark\context.py in getOrCreate(cls, conf)
    382         with SparkContext._lock:
    383             if SparkContext._active_spark_context is None:
--> 384                 SparkContext(conf=conf or SparkConf())
    385             return SparkContext._active_spark_context
    386

~\anaconda3\lib\site-packages\pyspark\context.py in __init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls)
    142                 " is not allowed as it is a security risk.")
    143
--> 144         SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
    145         try:
    146             self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,

~\anaconda3\lib\site-packages\pyspark\context.py in _ensure_initialized(cls, instance, gateway, conf)
    329         with SparkContext._lock:
    330             if not SparkContext._gateway:
--> 331                 SparkContext._gateway = gateway or launch_gateway(conf)
    332                 SparkContext._jvm = SparkContext._gateway.jvm
    333

~\anaconda3\lib\site-packages\pyspark\java_gateway.py in launch_gateway(conf, popen_kwargs)
     99         else:
    100             # preexec_fn not supported on Windows
--> 101             proc = Popen(command, **popen_kwargs)
    102
    103         # Wait for the file to appear, or for the process to exit, whichever happens first.

~\anaconda3\lib\subprocess.py in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors, text)
    852                             encoding=encoding, errors=errors)
    853
--> 854             self._execute_child(args, executable, preexec_fn, close_fds,
    855                                 pass_fds, cwd, env,
    856                                 startupinfo, creationflags, shell,

~\anaconda3\lib\subprocess.py in _execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, unused_restore_signals, unused_start_new_session)
   1305             # Start the process
   1306             try:
-> 1307                 hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
   1308                                                # no special security
   1309                                                None, None,

FileNotFoundError: [WinError 2] The system cannot find the file specified
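The [WinError 2] is raised by Popen when it cannot find the executable it was asked to start: launch_gateway tries to launch the Spark JVM from the SPARK_HOME set earlier, but that Cloudera path does not exist on this Windows machine, which is running the pip-installed PySpark instead. A minimal sketch of a local workaround, assuming a locally installed JDK (the JAVA_HOME path below is only an example; adjust it to your own install and restart the kernel before re-running):

import os

# Drop the Cloudera-specific settings so PySpark falls back to the Spark
# distribution bundled with the pip-installed package.
for var in ("SPARK_HOME", "PYSPARK_PYTHON", "PYLIB"):
    os.environ.pop(var, None)

# Point JAVA_HOME at a local JDK (example path, not a given).
os.environ["JAVA_HOME"] = r"C:\Program Files\Java\jdk1.8.0_281"

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('demo').master("local").getOrCreate()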
# This fails: `spark` was never created because the getOrCreate() call above errored
# out (and the attribute name should be `sparkContext`, not `SparkContext`).
sc = spark.SparkContext
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-14-e3a11d985a77> in <module>
----> 1 sc = spark.SparkContext

NameError: name 'spark' is not defined
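Once getOrCreate() succeeds, the context is exposed on the session as the lowercase attribute sparkContext; a short sketch of the intended sequence (same appName and master as the cells above):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('demo').master("local").getOrCreate()

# The SparkContext hangs off the session as a lowercase attribute.
sc = spark.sparkContext
print(sc.version)   # Spark version backing this session
print(sc.master)    # "local"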