Reading csv from Amazon s3 using python2.7
I can easily list the bucket names from S3, but whenever I try to read a CSV file from S3 I get an error.
import boto3
import pandas as pd
# Create an S3 client with explicit (placeholder) credentials.
s3 = boto3.client(
    's3',
    aws_access_key_id='yyyyyyyy',
    aws_secret_access_key='xxxxxxxxxxx',
)
# Call S3 to list current buckets
response = s3.list_buckets()
for bucket in response['Buckets']:
    # A parenthesised single-argument print behaves the same on
    # Python 2.7 and Python 3.
    print(bucket['Name'])
output
s3-bucket-data
...
import pandas as pd
import StringIO
from boto.s3.connection import S3Connection
# Read a CSV object from S3 into pandas using the legacy boto (v2) API.
# NOTE(review): the "400 Bad Request" reported for this snippet presumably
# comes from the bucket living in a region that requires Signature Version 4,
# which this default connection does not use -- confirm against the bucket's
# region.
# Placeholder credentials.
AWS_KEY = 'yyyyyyyyyy'
AWS_SECRET = 'xxxxxxxxxx'
aws_connection = S3Connection(AWS_KEY, AWS_SECRET)
# Look up the bucket by name.
bucket = aws_connection.get_bucket('s3-bucket-data')
fileName = "data.csv"
# Fetch the key and download its whole contents as a string.
content = bucket.get_key(fileName).get_contents_as_string()
# Wrap the string in a file-like object so pandas can parse it as CSV.
reader = pd.read_csv(StringIO.StringIO(content))
But I get the following error:
boto.exception.S3ResponseError: S3ResponseError: 400 Bad Request
How can I read csv from s3?
source to share
You can use the s3fs package.
s3fs also supports AWS profiles defined in credential files.
Here's an example (you don't need to read the file in chunks, but that is what this example does):
import os
import pandas as pd
import s3fs
import gzip
# Read a gzip-compressed CSV from S3 in chunks and assemble one DataFrame.
chunksize = 999999                  # rows per chunk handed back by read_csv
usecols = ["Col1", "Col2"]          # only these columns are parsed
filename = 'some_csv_file.csv.gz'
s3_bucket_name = 'some_bucket_name'
AWS_KEY = 'yyyyyyyyyy'
AWS_SECRET = 'xxxxxxxxxx'

s3f = s3fs.S3FileSystem(
    anon=False,
    key=AWS_KEY,
    secret=AWS_SECRET)
# or if you have a profile defined in credentials file:
#aws_shared_credentials_file = 'path/to/aws/credentials/file/'
#os.environ['AWS_SHARED_CREDENTIALS_FILE'] = aws_shared_credentials_file
#s3f = s3fs.S3FileSystem(
# anon=False,
# profile_name=s3_profile)

# S3 paths always use '/', so build the path explicitly instead of with
# os.path.join, which would insert '\\' on Windows.
filepath = '{}/{}'.format(s3_bucket_name, filename)

with s3f.open(filepath, 'rb') as f:
    gz = gzip.GzipFile(fileobj=f)  # Decompress data with gzip
    # Passing chunksize already makes read_csv return a lazy chunk reader.
    chunks = pd.read_csv(gz,
                         usecols=usecols,
                         chunksize=chunksize,
                         )
    # The reader is lazy, so it must be consumed while the S3 file is still
    # open. Chunks are consecutive blocks of rows, so stack them along the
    # row axis (axis=0); axis=1 would place them side by side as columns.
    df = pd.concat(chunks, axis=0)
source to share
boto is what I like to use when it comes to handling data in S3 with Python.
Install boto with `pip install boto`.
import boto
from boto.s3.key import Key
# Download a single S3 object to a local file using boto.
# Placeholder credentials.
access_key_id = "your_aws_key_id"
secret_access_key = "your_aws_secret_key_id"

bucket_name = "mybucket001"      # S3 bucket name
remote_name = "abc.txt"          # filename on S3
local_name = "s3_abc.txt"        # output file name

# Connect and resolve the bucket in one step.
s3_bucket = boto.connect_s3(access_key_id, secret_access_key).get_bucket(bucket_name)

# Build a Key handle for the object inside the bucket ...
remote_key = Key(s3_bucket, remote_name)
# ... and write its contents to the local output file.
remote_key.get_contents_to_filename(local_name)
source to share
I faced this issue with multiple AWS regions. I created a bucket in "us-east-1" and the following code worked fine:
import boto
from boto.s3.key import Key
import StringIO
import pandas as pd
# Read a CSV object from a us-east-1 bucket into pandas using boto.
# Placeholder credentials.
access_key_id = "xxxxxxxxxxxxxxxxxx"
secret_access_key = "yyyyyyyyyyyyyyyyyy"

bucket_name = "elasticbeanstalk-us-east-1-aaaaaaaaaaaa"
csv_key_name = "zzzzz.csv"

connection = boto.connect_s3(access_key_id, secret_access_key)
s3_bucket = connection.get_bucket(bucket_name)

# Pull the whole object into memory as a string.
csv_text = Key(s3_bucket, csv_key_name).get_contents_as_string()

# pandas needs a file-like object, so wrap the string before parsing.
csv_buffer = StringIO.StringIO(csv_text)
reader = pd.read_csv(csv_buffer)
Try creating a new bucket in us-east-1 and see if it works.
source to share
Try the following:
import boto3
from boto3 import session
import pandas as pd
import io
# Read a CSV object from S3 into pandas with boto3, pinning the region and
# signature version explicitly. The 'XXXX' values are placeholders
# (presumably the bucket's region and a signature version such as 's3v4' --
# confirm for your bucket).
# Use a distinct variable name so the Session instance does not rebind the
# `session` name imported from boto3.
aws_session = boto3.session.Session(region_name='XXXX')
s3client = aws_session.client(
    's3',
    config=boto3.session.Config(signature_version='XXXX'),
)
response = s3client.get_object(Bucket='myBucket', Key='myKey')
# Read the whole response body into memory and wrap it in BytesIO so pandas
# receives a file-like object.
dataset = pd.read_csv(io.BytesIO(response['Body'].read()), encoding='utf8')
source to share