Error while grabbing table data from website
I am trying to grab some stock-related data from the web for my project. I ran into two problems.
Problem 1:
I tried to grab the table from this site http://sharesansar.com/c/today-share-price.html
It worked, but the columns were not grabbed correctly. For example, the Company Name column contains values for Open Price. How can I solve this?
Problem 2:
I also tried to get company-specific data from http://merolagani.com/CompanyDetail.aspx?symbol=ADBL under "Price History".
This time I got an error while grabbing the table data. The error I received is:
self.data[key].append(cols[index].get_text())
IndexError: list index out of range
The code looks like this:
import logging
import requests
from bs4 import BeautifulSoup
import pandas

module_logger = logging.getLogger('mainApp.dataGrabber')


class DataGrabberTable:
    '''Grabs the table data from a certain url.'''

    def __init__(self, url, csvfilename, columnName=[], tableclass=None):
        module_logger.info("Inside 'DataGrabberTable' constructor.")
        self.pgurl = url
        self.tableclass = tableclass
        self.csvfile = csvfilename
        self.columnName = columnName
        self.tableattrs = {'class': tableclass}  # to be passed in find()
        module_logger.info("Done.")

    def run(self):
        '''Call this to run the data grabber. Returns 1 if an error occurs.'''
        module_logger.info("Inside 'DataGrabberTable.run()'.")
        try:
            self.rawpgdata = (requests.get(self.pgurl, timeout=5)).text
        except Exception as e:
            module_logger.warning('Error occurred: {0}'.format(e))
            return 1
        #module_logger.info('Headers from the server:\n {0}'.format(self.rawpgdata.headers))
        soup = BeautifulSoup(self.rawpgdata, 'lxml')
        module_logger.info('Connected and parsed the data.')
        table = soup.find('table', attrs=self.tableattrs)
        rows = table.find_all('tr')[1:]
        # initializing a dict in the format below
        # data = {'col1': [...], 'col2': [...], }
        # col1 and col2 are from the columnName list
        self.data = {}
        self.data = dict(zip(self.columnName, [list() for i in range(len(self.columnName))]))
        module_logger.info('Inside for loop.')
        for row in rows:
            cols = row.find_all('td')
            index = 0
            for key in self.data:
                if index > len(cols): break
                self.data[key].append(cols[index].get_text())
                index += 1
        module_logger.info('Completed the for loop.')
        self.dataframe = pandas.DataFrame(self.data)  # make pandas dataframe
        module_logger.info('writing to file {0}'.format(self.csvfile))
        self.dataframe.to_csv(self.csvfile)
        module_logger.info('written to file {0}'.format(self.csvfile))
        module_logger.info("Done.")
        return 0

    def getData(self):
        """Returns the 'data' dictionary."""
        return self.data
# Usage example
def main():
    url = "http://sharesansar.com/c/today-share-price.html"
    classname = "table"
    fname = "data/sharesansardata.csv"
    cols = [str(i) for i in range(18)]  # make a list of columns
    '''cols = [
        'S.No', 'Company Name', 'Symbol', 'Open price', 'Max price',
        'Min price', 'Closing price', 'Volume', 'Previous closing',
        'Turnover', 'Difference',
        'Diff percent', 'Range', 'Range percent', '90 days', '180 days',
        '360 days', '52 weeks high', '52 weeks low']'''
    d = DataGrabberTable(url, fname, cols, classname)
    if d.run() == 1:
        print('Data grabbing failed!')
    else:
        print('Data grabbing done.')


if __name__ == '__main__':
    main()
Any tips would help. Thanks!
Your cols list is missing an item: the table contains 19 columns, not 18:
>>> len([str(i) for i in range(18)])
18
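The IndexError from the merolagani page is a separate boundary bug in the same loop: whenever a row has fewer td cells than you have column names (a header or separator row may have none at all), index == len(cols) slips past your > check, so cols[index] is evaluated and raises. A minimal fix inside your existing loop uses >= and enumerate:

for row in rows:
    cols = row.find_all('td')
    for index, key in enumerate(self.data):
        if index >= len(cols):  # >= catches the boundary case that > lets through
            break
        self.data[key].append(cols[index].get_text())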
Also, you seem to be overcomplicating things. You need to do the following:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# grab the price table; the first row holds the header cells
price_response = requests.get('http://sharesansar.com/c/today-share-price.html')
price_table = BeautifulSoup(price_response.text, 'lxml').find('table', {'class': 'table'})
price_rows = [[cell.text for cell in row.find_all(['th', 'td'])] for row in price_table.find_all('tr')]
price_df = pd.DataFrame(price_rows[1:], columns=price_rows[0])

# fetch each company's detail table; each tbody row pairs a label (th) with a value (td)
com_df = None
for symbol in price_df['Symbol']:
    comp_response = requests.get('http://merolagani.com/CompanyDetail.aspx?symbol=%s' % symbol)
    comp_table = BeautifulSoup(comp_response.text, 'lxml').find('table', {'class': 'table'})
    com_header, com_value = list(), list()
    for tbody in comp_table.find_all('tbody'):
        comp_row = tbody.find('tr')
        com_header.append(comp_row.find('th').text.strip().replace('\n', ' ').replace('\r', ' '))
        com_value.append(comp_row.find('td').text.strip().replace('\n', ' ').replace('\r', ' '))
    df = pd.DataFrame([com_value], columns=com_header)
    com_df = df if com_df is None else pd.concat([com_df, df])

print(price_df)
print(com_df)
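Since your original class wrote its results to CSV, you can finish the same way (the filenames below are just examples):

price_df.to_csv('data/sharesansardata.csv', index=False)
com_df.to_csv('data/companydetails.csv', index=False)

Note that the snippet above drops the timeout and try/except you had in run(); if the sites are slow or flaky you may want them back, e.g. requests.get(url, timeout=5) wrapped in a try/except requests.RequestException.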