Tensorflow NLTK Sentiment Analysis Prediction Error

Question

Tensorflow NLTK Sentiment Analysis Prediction Error

I am studying sentiment analysis using the Tensorflow framework.

I am following the tutorials from pythonprogramming_tutorial (create_feature_sets_and_labels) and pythonprogramming_tutorial (train_test)

In create_sentiment_featuresets.py (1st link) I added only one method, to expose the extracted vocabulary, and I modified the code given by sentiment_demo.py (2nd link) to test the sentiment of a given input string.

create_sentiment_featuresets.py

import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import random
import pickle
from collections import Counter
from nltk.stem import WordNetLemmatizer

# Module-wide WordNet lemmatizer instance, used when building the lexicon
# and when vectorising samples.
lemmatizer = WordNetLemmatizer()
# NOTE(review): hm_lines is not referenced anywhere in the code shown here.
hm_lines = 100000
def create_lexicon(pos, neg):
    """Build the vocabulary shared by the positive and negative sample files.

    Every line of both files is tokenized and each token is lemmatized. Only
    lemmas whose total count is strictly between 50 and 1000 are kept.

    Args:
        pos: Path to the text file holding the positive samples.
        neg: Path to the text file holding the negative samples.

    Returns:
        A list with the retained lemmas, in the iteration order of the
        underlying Counter.
    """
    lexicon = []
    # Both files are processed identically, positive samples first.
    for path in (pos, neg):
        # The `with` block closes the file, so no explicit close() is needed.
        with open(path, 'r') as f:
            for line in f.readlines():
                # Python 2: lines are byte strings and are decoded to unicode.
                line = line.decode('utf-8')
                lexicon += list(word_tokenize(line))

    # NOTE(review): tokens are not lower-cased here, whereas sample_handling
    # lower-cases every word before looking it up -- capitalised entries
    # presumably never match; confirm before changing.
    lexicon = [lemmatizer.lemmatize(i) for i in lexicon]
    w_counts = Counter(lexicon)
    # Keep only lemmas that are neither too rare nor too common.
    l2 = [w for w in w_counts if 1000 > w_counts[w] > 50]
    # This reports the total number of lemmatized tokens, not len(l2).
    print("Lexicon length create_lexicon: ",len(lexicon))

    return l2

def sample_handling(sample, lexicon, classification):
    """Convert every line of a sample file into a bag-of-words feature vector.

    Each line is lower-cased, tokenized and lemmatized; the resulting vector
    counts how often each lexicon word occurs in that line.

    Args:
        sample: Path to the text file to vectorise, one sample per line.
        lexicon: List of vocabulary words (must be hashable); a word's
            position in this list is its index in the feature vector.
        classification: Label stored alongside every sample of this file.

    Returns:
        A list of [features, classification] pairs, where features is a list
        of len(lexicon) counts.
    """
    print("Lexicon length Sample handling: ",len(lexicon))

    # Resolve word -> position once instead of scanning the list twice per
    # token. setdefault keeps the first position, matching list.index().
    word_index = {}
    for position, entry in enumerate(lexicon):
        word_index.setdefault(entry, position)

    featureset = []
    # The `with` block closes the file, so no explicit close() is needed.
    with open(sample, 'r') as f:
        for line in f.readlines():
            # Python 2: lines are byte strings and are decoded to unicode.
            line = line.decode('utf-8')
            current_words = word_tokenize(line.lower())
            current_words = [lemmatizer.lemmatize(i) for i in current_words]

            features = np.zeros(len(lexicon))
            for word in current_words:
                index_value = word_index.get(word.lower())
                if index_value is not None:
                    features[index_value] += 1

            featureset.append([list(features), classification])
    print("Feature SET------")
    print(len(featureset))
    return featureset

def create_feature_sets_and_labels(pos, neg, test_size = 0.1):
    """Build shuffled train/test feature vectors and labels from two files.

    The lexicon is built from both files and stored in the module-level
    m_lexicon. Positive samples are labelled [1,0] and negative samples
    [0,1]. After shuffling, the last int(test_size * n) samples form the
    test set.

    Args:
        pos: Path to the text file holding the positive samples.
        neg: Path to the text file holding the negative samples.
        test_size: Fraction of all samples reserved for testing.

    Returns:
        A tuple (train_x, train_y, test_x, test_y) of lists; the x lists hold
        feature vectors and the y lists hold the matching labels.
    """
    global m_lexicon
    m_lexicon = create_lexicon(pos, neg)
    features = []
    features += sample_handling(pos, m_lexicon, [1,0])
    features += sample_handling(neg, m_lexicon, [0,1])

    random.shuffle(features)

    testing_size = int(test_size * len(features))
    # Split on an explicit index: a slice such as [:-0] would be empty and
    # would leave the training set without samples when testing_size is 0.
    split_at = max(len(features) - testing_size, 0)
    training, testing = features[:split_at], features[split_at:]

    # The [features, label] pairs are ragged, so they are unpacked with plain
    # list operations; recent NumPy versions refuse to build an array from
    # such nested sequences.
    train_x = [sample[0] for sample in training]
    train_y = [sample[1] for sample in training]
    test_x = [sample[0] for sample in testing]
    test_y = [sample[1] for sample in testing]

    return train_x, train_y, test_x, test_y

def get_lexicon():
    """Return the module-level lexicon set by create_feature_sets_and_labels.

    Raises:
        NameError: If m_lexicon has not been assigned yet.
    """
    # Reading a module-level name needs no `global` declaration.
    vocabulary = m_lexicon
    return vocabulary

For training and testing, I am using the pos.txt and neg.txt files mentioned in the first link. Each file contains 5000 sentences, positive and negative respectively.

Below is my sentiment_demo.py:

from create_sentiment_featuresets import create_feature_sets_and_labels
from create_sentiment_featuresets import get_lexicon

import tensorflow as tf
import pickle
import numpy as np

# extras for testing
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
#- end extras


# Build the lexicon and load the vectorised train/test samples and labels.
train_x, train_y, test_x, test_y = create_feature_sets_and_labels('pos.txt', 'neg.txt')

# Number of units in each of the three hidden layers.
n_nodes_hl1 = 1500
n_nodes_hl2 = 1500
n_nodes_hl3 = 1500

# Two output classes; training uses mini-batches of batch_size samples for
# hm_epochs passes over the training data.
n_classes = 2
batch_size = 100
hm_epochs = 5

# Placeholders for the input features and the labels. No shape is declared,
# so the rank of whatever is fed is only checked when the graph runs.
x = tf.placeholder('float')
y = tf.placeholder('float')

# Weights and biases of each layer, initialised from a normal distribution.
# The first layer's input width is the length of one training feature vector.
# NOTE(review): the 'f_fum' entries are never read in the code shown here.
hidden_1_layer = {'f_fum': n_nodes_hl1,
                'weight': tf.Variable(tf.random_normal([len(train_x[0]), n_nodes_hl1])),
                'bias': tf.Variable(tf.random_normal([n_nodes_hl1]))}
hidden_2_layer = {'f_fum': n_nodes_hl2,
                'weight': tf.Variable(tf.random_normal([n_nodes_hl1, n_nodes_hl2])),
                'bias': tf.Variable(tf.random_normal([n_nodes_hl2]))}
hidden_3_layer = {'f_fum': n_nodes_hl3,
                'weight': tf.Variable(tf.random_normal([n_nodes_hl2, n_nodes_hl3])),
                'bias': tf.Variable(tf.random_normal([n_nodes_hl3]))}
output_layer = {'f_fum': None,
                'weight': tf.Variable(tf.random_normal([n_nodes_hl3, n_classes])),
                'bias': tf.Variable(tf.random_normal([n_classes]))}


def nueral_network_model(data):
    """Forward pass of the network: three ReLU hidden layers and a linear output.

    Args:
        data: Input tensor. It is passed straight to tf.matmul with the first
            weight matrix, so it must be a matrix of shape
            (batch, n_features); a 1-D vector fails with
            "In[0] is not a matrix".

    Returns:
        The output-layer tensor (no activation applied), one row per input
        row and n_classes columns.
    """

    # Hidden layer 1: (data * weights) + bias, followed by ReLU.
    l1 = tf.add(tf.matmul(data, hidden_1_layer['weight']), hidden_1_layer['bias'])
    l1 = tf.nn.relu(l1)

    # Hidden layer 2.
    l2 = tf.add(tf.matmul(l1, hidden_2_layer['weight']), hidden_2_layer['bias'])
    l2 = tf.nn.relu(l2)

    # Hidden layer 3.
    l3 = tf.add(tf.matmul(l2, hidden_3_layer['weight']), hidden_3_layer['bias'])
    l3 = tf.nn.relu(l3)

    # Output layer: left without activation; softmax is applied by the loss.
    output = tf.matmul(l3, output_layer['weight']) + output_layer['bias']

    return output

def train_neural_network(x):
    """Train the network, print its test accuracy and classify one sentence.

    Trains for hm_epochs epochs in mini-batches of batch_size using Adam on
    the softmax cross-entropy loss, prints the accumulated loss per epoch and
    the accuracy on test_x/test_y, then vectorises a hard-coded sentence with
    the lexicon and prints the predicted sentiment.

    Args:
        x: Placeholder that is fed with the input feature vectors.
    """
    prediction = nueral_network_model(x)
    # Mean softmax cross-entropy between the network output and the labels.
    cost = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits= prediction, labels= y))
    optimizer = tf.train.AdamOptimizer(learning_rate= 0.001).minimize(cost)



    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for epoch in range(hm_epochs):
            epoch_loss = 0
            i = 0
            # Walk over the training data in consecutive slices of batch_size.
            while i < len(train_x):
                start = i
                end = i+ batch_size
                batch_x = np.array(train_x[start: end])
                batch_y = np.array(train_y[start: end])

                _, c = sess.run([optimizer, cost], feed_dict= {x: batch_x, y: batch_y})
                epoch_loss += c
                i+= batch_size
            print('Epoch', epoch+ 1, 'completed out of ', hm_epochs, 'loss:', epoch_loss)

        # A sample counts as correct when the highest-scoring output index
        # equals the index of the 1 in its label.
        correct= tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))

        print('Accuracy:', accuracy.eval({x:test_x, y:test_y}))


        # testing ------Trying to predict the sentiment for an input string--------
        m_lexicon= get_lexicon()
        print('Lexicon length: ',len(m_lexicon))

        input_data= "He is an idiot"

        # Vectorise the sentence: lower-case, tokenize, lemmatize, then count
        # the occurrences of every lexicon word.
        current_words= word_tokenize(input_data.lower())
        current_words = [lemmatizer.lemmatize(i) for i in current_words]
        features = np.zeros(len(m_lexicon))

        for word in current_words:
            if word.lower() in m_lexicon:
                index_value = m_lexicon.index(word.lower())
                features[index_value] +=1

        # NOTE(review): features is a 1-D array of length len(m_lexicon) here,
        # but the first tf.matmul in the model needs a 2-D input; feeding it
        # as-is raises "In[0] is not a matrix". A single sample has to be fed
        # as a batch of one, i.e. with shape (1, len(m_lexicon)).
        features = np.array(list(features))
        print('features length: ',len(features))
        result = sess.run(tf.argmax(prediction.eval(feed_dict={x:features}), 1))
        print('RESULT: ', result)
        # Index 0 is the positive class and index 1 the negative class
        # (positive samples are labelled [1,0], negative samples [0,1]).
        if result[0] == 0:
            print('Positive: ', input_data)
        elif result[0] == 1:
            print('Negative: ', input_data)


# Script entry point: train and evaluate the network, then run the demo
# prediction, using the module-level placeholder x as the model input.
train_neural_network(x)

The program runs until the loss for every epoch and the accuracy have been printed, after which it gives the following error:

('Epoch', 1, 'completed out of ', 5, 'loss:', 1289814.4057617188)
('Epoch', 2, 'completed out of ', 5, 'loss:', 457882.97705078125)
('Epoch', 3, 'completed out of ', 5, 'loss:', 243073.83074951172)
('Epoch', 4, 'completed out of ', 5, 'loss:', 245525.22399902344)
('Epoch', 5, 'completed out of ', 5, 'loss:', 233219.91000366211)
('Accuracy:', 0.59287059)
('Lexicon length: ', 423)
('features length: ', 423)
Traceback (most recent call last):
  File "sentiment_demo.py", line 110, in <module>
train_neural_network(x)
  File "sentiment_demo.py", line 102, in train_neural_network
result = sess.run(tf.argmax(prediction.eval(feed_dict={x:features}), 1))
  File "/home/lsmpc/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 569, in eval
return _eval_using_default_session(self, feed_dict, self.graph, session)
  File "/home/lsmpc/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 3741, in _eval_using_default_session
return session.run(tensors, feed_dict)
  File "/home/lsmpc/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 778, in run
run_metadata_ptr)
  File "/home/lsmpc/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 982, in _run
feed_dict_string, options, run_metadata)
  File "/home/lsmpc/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1032, in _do_run
target_list, options, run_metadata)
  File "/home/lsmpc/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1052, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InvalidArgumentError: In[0] is not a matrix
 [[Node: MatMul = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/gpu:0"](_recv_Placeholder_0/_23, Variable/read)]]
 [[Node: add/_25 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_4_add", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]

Caused by op u'MatMul', defined at:
  File "sentiment_demo.py", line 110, in <module>
    train_neural_network(x)
  File "sentiment_demo.py", line 58, in train_neural_network
    prediction = nueral_network_model(x)
  File "sentiment_demo.py", line 44, in nueral_network_model
    l1 = tf.add(tf.matmul(data, hidden_1_layer['weight']), hidden_1_layer['bias'])
  File "/home/lsmpc/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/ops/math_ops.py", line 1801, in matmul
a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name)
  File "/home/lsmpc/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/ops/gen_math_ops.py", line 1263, in _mat_mul
transpose_b=transpose_b, name=name)
  File "/home/lsmpc/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 768, in apply_op
op_def=op_def)
  File "/home/lsmpc/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2336, in create_op
original_op=self._default_original_op, op_def=op_def)
  File "/home/lsmpc/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1228, in __init__
self._traceback = _extract_stack()

InvalidArgumentError (see above for traceback): In[0] is not a matrix
 [[Node: MatMul = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/gpu:0"](_recv_Placeholder_0/_23, Variable/read)]]
 [[Node: add/_25 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_4_add", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]

The error above indicates this:

Caused by op u'MatMul', defined at:
  File "sentiment_demo.py", line 110, in <module>
    train_neural_network(x)
  File "sentiment_demo.py", line 58, in train_neural_network
    prediction = nueral_network_model(x)
  File "sentiment_demo.py", line 44, in nueral_network_model
    l1 = tf.add(tf.matmul(data, hidden_1_layer['weight']), hidden_1_layer['bias'])

I'm new to this and I can't seem to fix it.

+3

python python-2.7 nltk tensorflow sentiment-analysis

LinuxBeginner 01 june 17 at 11:55

source to share

1 answer

avloss · Accepted Answer · 2017-06-01T13:02:41+0000

It looks like your features array has the wrong shape. Try this:

    features = np.array(list(features)).reshape(1,-1)

Your model accepts batches of data, so if you only want to run one prediction you need to reshape the input into a batch of size 1. Good luck!

Tensorflow NLTK Sentiment Analysis Prediction Error

More articles: