Py4JJavaError when connecting to Cassandra from Spark
How can I connect to Cassandra from Spark using an iPython notebook?
I followed the code here and changed it,
import os
import sys

# Location of the local Spark installation.  Raw strings (r"...") are used so
# backslashes in the Windows paths are taken literally; in a plain string,
# sequences such as "\t" or "\n" would silently become escape characters,
# and "\A"/"\s"/"\p" are invalid escape sequences (deprecated in Python 3.6+).
os.environ['SPARK_HOME'] = r"C:\Apache\spark-1.4.1"

# Make the PySpark package bundled with Spark importable.
sys.path.append(r"C:\Apache\spark-1.4.1\python")

from pyspark import SparkContext
from pyspark.sql import SQLContext

# Cassandra connection settings: target host, keyspace, and column family.
host = 'localhost'
keyspace = 'demo'
cf = 'users'

sc = SparkContext(appName="CassandraInputFormat")

# Hadoop InputFormat configuration for reading the Cassandra column family
# through the legacy Thrift interface (port 9160).
conf = {"cassandra.input.thrift.address": host,
        "cassandra.input.thrift.port": "9160",
        "cassandra.input.keyspace": keyspace,
        "cassandra.input.columnfamily": cf,
        "cassandra.input.partitioner.class": "Murmur3Partitioner",
        "cassandra.input.page.row.size": "3"}

# NOTE(review): CqlPagingInputFormat and the two converter classes below live
# in the Spark examples jar, which must be on the driver classpath (e.g.
# spark-submit --driver-class-path ...); otherwise the JVM raises
# java.lang.ClassNotFoundException, as seen in the traceback below.
cass_rdd = sc.newAPIHadoopRDD(
    "org.apache.cassandra.hadoop.cql3.CqlPagingInputFormat",
    "java.util.Map",
    "java.util.Map",
    keyConverter="org.apache.spark.examples.pythonconverters.CassandraCQLKeyConverter",
    valueConverter="org.apache.spark.examples.pythonconverters.CassandraCQLValueConverter",
    conf=conf)

# Pull the full result set back to the driver.
output = cass_rdd.collect()
When I run it from ipython notebook I get this error below,
--------------------------------------------------------------------------- Py4JJavaError Traceback (most recent call last) <ipython-input-15-458818fce35c> in <module>()
29 keyConverter="org.apache.spark.examples.pythonconverters.CassandraCQLKeyConverter",
30 valueConverter="org.apache.spark.examples.pythonconverters.CassandraCQLValueConverter",
---> 31 conf=conf)
32
33 output = cass_rdd.collect()
C:\Apache\spark-1.4.1\python\pyspark\context.pyc in newAPIHadoopRDD(self, inputFormatClass, keyClass, valueClass, keyConverter, valueConverter, conf, batchSize)
599 jrdd = self._jvm.PythonRDD.newAPIHadoopRDD(self._jsc, inputFormatClass, keyClass,
600 valueClass, keyConverter, valueConverter,
--> 601 jconf, batchSize)
602 return RDD(jrdd, self)
603
C:\Users\lauthiamkok\inotebook\lib\site-packages\py4j-0.9-py2.7.egg\py4j\java_gateway.pyc in __call__(self, *args)
811 answer = self.gateway_client.send_command(command)
812 return_value = get_return_value(
--> 813 answer, self.gateway_client, self.target_id, self.name)
814
815 for temp_arg in temp_args:
C:\Users\lauthiamkok\inotebook\lib\site-packages\py4j-0.9-py2.7.egg\py4j\protocol.pyc in get_return_value(answer, gateway_client, target_id, name)
306 raise Py4JJavaError(
307 "An error occurred while calling {0}{1}{2}.\n".
--> 308 format(target_id, ".", name), value)
309 else:
310 raise Py4JError(
And further...
> Py4JJavaError: An error occurred while calling
> z:org.apache.spark.api.python.PythonRDD.newAPIHadoopRDD. :
> java.lang.ClassNotFoundException:
> org.apache.cassandra.hadoop.cql3.CqlPagingInputFormat at
> java.net.URLClassLoader.findClass(Unknown Source) at
> java.lang.ClassLoader.loadClass(Unknown Source) at
> java.lang.ClassLoader.loadClass(Unknown Source) at
> java.lang.Class.forName0(Native Method) at
> java.lang.Class.forName(Unknown Source) at
> org.apache.spark.util.Utils$.classForName(Utils.scala:179) at
> org.apache.spark.api.python.PythonRDD$.newAPIHadoopRDDFromClassNames(PythonRDD.scala:520)
> at
> org.apache.spark.api.python.PythonRDD$.newAPIHadoopRDD(PythonRDD.scala:503)
> at
> org.apache.spark.api.python.PythonRDD.newAPIHadoopRDD(PythonRDD.scala)
> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at
> sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source) at
> sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source) at
> java.lang.reflect.Method.invoke(Unknown Source) at
> py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231) at
> py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379) at
> py4j.Gateway.invoke(Gateway.java:259) at
> py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
> at py4j.commands.CallCommand.execute(CallCommand.java:79) at
> py4j.GatewayConnection.run(GatewayConnection.java:207) at
> java.lang.Thread.run(Unknown Source)
Any ideas what I missed?
+3
source to share
1 answer
The root cause appears to be the java.lang.ClassNotFoundException,
which means the jar containing the Cassandra input format classes was not found — the jar path is not being set correctly on the driver classpath.
Can you run the examples from the command line?
Run with example jar:
./bin/spark-submit --driver-class-path /path/to/example/jar \
/path/to/examples/cassandra_inputformat.py <host> <keyspace> <cf>
0
source to share