Connecting to Athena using Python and pyathenajdbc

I am trying to connect to AWS Athena from Python using pyathenajdbc. The problem I'm running into is establishing the connection: when I run the code below, I get an error saying it cannot find the AthenaDriver class (java.lang.RuntimeException: class com.amazonaws.athena.jdbc.AthenaDriver not found). I downloaded the driver JAR from AWS and have verified that it sits in the directory given by ATHENA_JDBC_CLASSPATH in the code.

from mdpbi.rsi.config import *
from mdpbi.tools.functions import mdpLog
from pkg_resources import resource_string
import argparse
import os
import pyathenajdbc
import sys

# Identifier handed to mdpLog() when main() creates its logger.
SCRIPT_NAME = "Athena_Export"

# Location of the Athena JDBC driver jar; main() passes it to
# pyathenajdbc.connect() as driver_path.
ATHENA_JDBC_CLASSPATH = "/opt/amazon/athenajdbc/AthenaJDBC41-1.0.0.jar"
# Export file name and its full path under WORKINGDIR. WORKINGDIR presumably
# comes from the mdpbi.rsi.config star import; neither constant is used in
# the code shown here.
EXPORT_OUTFILE = "RSI_Export.txt"
EXPORT_OUTFILE_PATH = os.path.join(WORKINGDIR, EXPORT_OUTFILE)


def get_arg_parser():
    """Build the command-line parser for this script.

    The module docstring is used as the description, and
    RawDescriptionHelpFormatter keeps its line breaks in the help output.
    No options are registered.
    """
    return argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__,
    )


def main():
    """Run the packaged athena.sql query against Athena and log the result.

    Logs the cursor description and every fetched row, closes the
    connection whether or not the query succeeds, and returns 0.
    """
    # Parsed only for argparse's own handling (e.g. --help); the parser
    # defines no options, so the result is not needed.
    get_arg_parser().parse_args(sys.argv[1:])
    log = mdpLog(SCRIPT_NAME, LOGDIR)

    query = resource_string("mdpbi.rsi.athena.resources", "athena.sql")

    connect_kwargs = dict(
        s3_staging_dir="s3://athena",
        access_key=AWS_ACCESS_KEY_ID,
        secret_key=AWS_SECRET_ACCESS_KEY,
        region_name="us-east-1",
        log_path=LOGDIR,
        driver_path=ATHENA_JDBC_CLASSPATH,
    )
    connection = pyathenajdbc.connect(**connect_kwargs)
    try:
        with connection.cursor() as cur:
            cur.execute(query)
            log.info(cur.description)
            log.info(cur.fetchall())
    finally:
        connection.close()

    return 0


# Script entry point: use main()'s return value as the process exit status.
if __name__ == '__main__':
    sys.exit(main())

      

Traceback (most recent call last):
  File "/usr/lib64/python2.7/runpy.py", line 174, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
  File "/usr/lib64/python2.7/runpy.py", line 72, in _run_code
    exec code in run_globals
  File "/home/ec2-user/jason_testing/mdpbi/rsi/athena/__main__.py", line 53, in <module>
    rtn = main()
  File "/home/ec2-user/jason_testing/mdpbi/rsi/athena/__main__.py", line 39, in main
    driver_path=athena_jdbc_driver_path
  File "/opt/mdpbi/Python_Envs/2.7.10/local/lib/python2.7/dist-packages/pyathenajdbc/__init__.py", line 65, in connect
    driver_path, **kwargs)
  File "/opt/mdpbi/Python_Envs/2.7.10/local/lib/python2.7/dist-packages/pyathenajdbc/connection.py", line 68, in __init__
    jpype.JClass(ATHENA_DRIVER_CLASS_NAME)
  File "/opt/mdpbi/Python_Envs/2.7.10/lib64/python2.7/dist-packages/jpype/_jclass.py", line 55, in JClass
    raise _RUNTIMEEXCEPTION.PYEXC("Class %s not found" % name)

+3


source to share


2 answers


The JDBC driver requires Java 8, and I was running Java 7. I was able to install a newer version of Java on my EC2 instance by following this guide:

https://tecadmin.net/install-java-8-on-centos-rhel-and-fedora/#



I also needed to point my code at the new Java installation by setting the Java environment variables. With these changes, the code now works as expected.

from mdpbi.rsi.config import *
from mdpbi.tools.functions import mdpLog
from pkg_resources import resource_string
import argparse
import os
import pyathenajdbc
import sys

# Identifier handed to mdpLog() when main() creates its logger.
SCRIPT_NAME = "Athena_Export"


def get_arg_parser():
    """Create and return the argparse parser used by this script.

    The description is taken from the module docstring and is printed
    as written, because RawDescriptionHelpFormatter does not re-wrap it.
    The parser has no options of its own.
    """
    parser_options = {
        "description": __doc__,
        "formatter_class": argparse.RawDescriptionHelpFormatter,
    }
    return argparse.ArgumentParser(**parser_options)


def main():
    """Run the packaged athena.sql query against Athena and log the result.

    Before connecting, the process environment is pointed at the Java 8
    JDK (the Athena JDBC driver did not load under Java 7). The JDK
    directories are *prepended* to PATH rather than replacing it, so other
    executables on the inherited PATH remain reachable.

    Logs the cursor description and every fetched row, closes the
    connection whether or not the query succeeds, and returns 0.
    """
    # Parsed only for argparse's own handling (e.g. --help); the parser
    # defines no options, so the result is not needed.
    get_arg_parser().parse_args(sys.argv[1:])
    logger = mdpLog(SCRIPT_NAME, LOGDIR)

    SQL = resource_string("mdpbi.rsi.athena.resources", "athena.sql")

    # Single source of truth for the JDK location.
    jdk_home = "/opt/jdk1.8.0_121"
    jre_home = os.path.join(jdk_home, "jre")
    os.environ["JAVA_HOME"] = jdk_home
    os.environ["JRE_HOME"] = jre_home

    # Put the Java 8 binaries first, but keep whatever PATH we inherited.
    path_entries = [os.path.join(jdk_home, "bin"), os.path.join(jre_home, "bin")]
    inherited_path = os.environ.get("PATH")
    if inherited_path:
        path_entries.append(inherited_path)
    os.environ["PATH"] = os.pathsep.join(path_entries)

    conn = pyathenajdbc.connect(
        s3_staging_dir="s3://mdpbi.data.rsi.out/",
        access_key=AWS_ACCESS_KEY_ID,
        secret_key=AWS_SECRET_ACCESS_KEY,
        schema_name="rsi",
        region_name="us-east-1"
    )
    try:
        with conn.cursor() as cursor:
            cursor.execute(SQL)
            logger.info(cursor.description)
            logger.info(cursor.fetchall())
    finally:
        conn.close()

    return 0


# Script entry point: use main()'s return value as the process exit status.
if __name__ == '__main__':
    sys.exit(main())

      

+4


source


Try the following:

    # Set the driver jar location on the pyathenajdbc module before calling connect().
    pyathenajdbc.ATHENA_JAR = ATHENA_JDBC_CLASSPATH

      



With this set, you don't need to pass the driver_path argument to the connect method.

+2


source







All Articles