XGBoostError: b'[19:12:58] src/metric/rank_metric.cc:89: Check failed: (preds.size()) == (info.labels.size()) label size predict size not match'

I am training an XGBClassifier on my training set.

My training features are of shape (45001, 10338), a numpy array, and my training labels are of shape (45001,), also a numpy array. [I have 1161 unique labels, so I label-encoded them.]
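For reference, here is a minimal sketch of that setup (toy shapes, and assuming scikit-learn's LabelEncoder was used for the label encoding):

import numpy as np
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# Toy stand-ins for the real arrays: features of shape (n_samples, n_features)
# and one raw label per sample.
train_x = np.random.rand(100, 20)
raw_labels = np.random.choice(['a', 'b', 'c'], size=100)

# Encode labels to integers 0..n_classes-1, as XGBoost expects.
train_y = LabelEncoder().fit_transform(raw_labels)

# A DMatrix can be built directly from numpy arrays.
xgtrain = xgb.DMatrix(train_x, label=train_y)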

It is clear from the documentation that I can create a DMatrix from a numpy array, so I am passing the training features and labels as numpy arrays directly. But I am getting the following error:

---------------------------------------------------------------------------
XGBoostError                              Traceback (most recent call last)
<ipython-input-30-3de36245534e> in <module>()
     13  scale_pos_weight=1,
     14  seed=27)
---> 15 modelfit(xgb1, train_x, train_y)

<ipython-input-27-9d215eac135e> in modelfit(alg, train_data_features, train_labels, useTrainCV, cv_folds, early_stopping_rounds)
      6         xgtrain = xgb.DMatrix(train_data_features, label=train_labels)
      7         cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
----> 8             metrics='auc',early_stopping_rounds=early_stopping_rounds)
      9         alg.set_params(n_estimators=cvresult.shape[0])
     10 

/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/training.py in cv(params, dtrain, num_boost_round, nfold, stratified, folds, metrics, obj, feval, maximize, early_stopping_rounds, fpreproc, as_pandas, verbose_eval, show_stdv, seed, callbacks)
    399         for fold in cvfolds:
    400             fold.update(i, obj)
--> 401         res = aggcv([f.eval(i, feval) for f in cvfolds])
    402 
    403         for key, mean, std in res:

/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/training.py in <listcomp>(.0)
    399         for fold in cvfolds:
    400             fold.update(i, obj)
--> 401         res = aggcv([f.eval(i, feval) for f in cvfolds])
    402 
    403         for key, mean, std in res:

/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/training.py in eval(self, iteration, feval)
    221     def eval(self, iteration, feval):
    222         """"Evaluate the CVPack for one iteration."""
--> 223         return self.bst.eval_set(self.watchlist, iteration, feval)
    224 
    225 

/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/core.py in eval_set(self, evals, iteration, feval)
    865             _check_call(_LIB.XGBoosterEvalOneIter(self.handle, iteration,
    866                                                   dmats, evnames, len(evals),
--> 867                                                   ctypes.byref(msg)))
    868             return msg.value
    869         else:

/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/core.py in _check_call(ret)
    125     """
    126     if ret != 0:
--> 127         raise XGBoostError(_LIB.XGBGetLastError())
    128 
    129 

XGBoostError: b'[19:12:58] src/metric/rank_metric.cc:89: Check failed: (preds.size()) == (info.labels.size()) label size predict size not match'


Please find my model code below:

import xgboost as xgb
from sklearn import metrics

def modelfit(alg, train_data_features, train_labels, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgb_param['num_class'] = 1161   
        xgtrain = xgb.DMatrix(train_data_features, label=train_labels)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc',early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])

    #Fit the algorithm on the data
    alg.fit(train_data_features, train_labels, eval_metric='auc')

    #Predict training set:
    dtrain_predictions = alg.predict(train_data_features)
    dtrain_predprob = alg.predict_proba(train_data_features)[:,1]

    #Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(train_labels, dtrain_predictions))


Where am I going wrong here?

My classifier looks like this:

xgb1 = xgb.XGBClassifier(
 learning_rate =0.1,
 n_estimators=50,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective='multi:softmax',
 nthread=4,
 scale_pos_weight=1,
 seed=27)


EDIT 2: After changing the eval metric

---------------------------------------------------------------------------
XGBoostError                              Traceback (most recent call last)
<ipython-input-9-30c62a886c2e> in <module>()
     13  scale_pos_weight=1,
     14  seed=27)
---> 15 modelfit(xgb1, train_x_trail, train_y_trail)

<ipython-input-8-9d215eac135e> in modelfit(alg, train_data_features, train_labels, useTrainCV, cv_folds, early_stopping_rounds)
      6         xgtrain = xgb.DMatrix(train_data_features, label=train_labels)
      7         cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
----> 8             metrics='auc',early_stopping_rounds=early_stopping_rounds)
      9         alg.set_params(n_estimators=cvresult.shape[0])
     10 

/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/training.py in cv(params, dtrain, num_boost_round, nfold, stratified, folds, metrics, obj, feval, maximize, early_stopping_rounds, fpreproc, as_pandas, verbose_eval, show_stdv, seed, callbacks)
    398                            evaluation_result_list=None))
    399         for fold in cvfolds:
--> 400             fold.update(i, obj)
    401         res = aggcv([f.eval(i, feval) for f in cvfolds])
    402 

/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/training.py in update(self, iteration, fobj)
    217     def update(self, iteration, fobj):
    218         """"Update the boosters for one iteration"""
--> 219         self.bst.update(self.dtrain, iteration, fobj)
    220 
    221     def eval(self, iteration, feval):

/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/core.py in update(self, dtrain, iteration, fobj)
    804 
    805         if fobj is None:
--> 806             _check_call(_LIB.XGBoosterUpdateOneIter(self.handle, iteration, dtrain.handle))
    807         else:
    808             pred = self.predict(dtrain)

/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/core.py in _check_call(ret)
    125     """
    126     if ret != 0:
--> 127         raise XGBoostError(_LIB.XGBGetLastError())
    128 
    129 

XGBoostError: b'[03:43:03] src/objective/multiclass_obj.cc:42: Check failed: (info.labels.size()) != (0) label set cannot be empty'
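A quick sanity check for this error is to confirm the labels actually made it into the DMatrix (a minimal sketch, assuming train_x_trail and train_y_trail are the arrays passed to modelfit):

import numpy as np
import xgboost as xgb

# Build the DMatrix the same way modelfit does and inspect its labels.
dtrain = xgb.DMatrix(train_x_trail, label=train_y_trail)
labels = dtrain.get_label()
print(labels.shape)            # expected: (45001,)
print(np.unique(labels).size)  # expected: 1161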


2 answers

The original error you are getting is because the AUC metric was not designed for multi-class classification.

You can use the scikit-learn wrapper for xgboost to work around this issue. I modified your code with that wrapper to create a similar function. I'm not sure why you would do a grid search though, since you are not enumerating over parameters; instead, the parameters specified in xgb1 are used as-is. Here's the modified code:

import xgboost as xgb
import sklearn
import numpy as np
from sklearn.model_selection import GridSearchCV

def modelfit(alg, train_data_features, train_labels, useTrainCV=True, cv_folds=5):

    if useTrainCV:
        # Wrap each parameter value in a list so GridSearchCV treats it
        # as a (single-point) parameter grid.
        params = alg.get_xgb_params()
        xgb_param = dict([(key, [params[key]]) for key in params])

        boost = xgb.sklearn.XGBClassifier()
        cvresult = GridSearchCV(boost, xgb_param, cv=cv_folds)
        # Fit on the arrays passed in, not on module-level globals.
        cvresult.fit(train_data_features, train_labels)
        alg = cvresult.best_estimator_


    #Fit the algorithm on the data
    alg.fit(train_data_features, train_labels)

    #Predict training set:
    dtrain_predictions = alg.predict(train_data_features)
    # Note: column 1 of predict_proba is the probability of class 1 only.
    dtrain_predprob = alg.predict_proba(train_data_features)[:,1]

    #Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % sklearn.metrics.accuracy_score(train_labels, dtrain_predictions))

xgb1 = xgb.sklearn.XGBClassifier(
 learning_rate =0.1,
 n_estimators=50,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective='multi:softmax',
 nthread=4,
 scale_pos_weight=1,
 seed=27)    


X=np.random.normal(size=(200,30))
y=np.random.randint(0,5,200)

modelfit(xgb1, X, y)
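Since each parameter grid entry contains a single value, the GridSearchCV above effectively performs plain k-fold cross-validation with the settings from xgb1; with its default refit=True, cvresult.best_estimator_ is that classifier refit on the full data.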

The output I get is

Model Report
Accuracy : 1


Note that I used a much smaller dataset; at the size you mentioned, the algorithm can be very slow.


The error is because you are trying to use the AUC metric for multi-class classification, but AUC is only applicable to two-class problems. In the xgboost implementation, "auc" expects the prediction size to be the same as the label size, while your multiclass prediction size will be 45001 * 1161. Use the "mlogloss" or "merror" metric instead.
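A minimal sketch of the corrected xgb.cv call (synthetic data with 5 classes standing in for the real arrays; the key changes are metrics='mlogloss' and num_class set to the actual number of classes):

import numpy as np
import xgboost as xgb

X = np.random.rand(200, 10)
y = np.random.randint(0, 5, 200)  # 5 classes for the sketch

dtrain = xgb.DMatrix(X, label=y)
params = {'objective': 'multi:softmax', 'num_class': 5, 'max_depth': 5}

# 'mlogloss' (or 'merror') is valid for multi-class; 'auc' is not.
cvresult = xgb.cv(params, dtrain, num_boost_round=50, nfold=5,
                  metrics='mlogloss', early_stopping_rounds=10, seed=27)
print(cvresult.head())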



PS: currently xgboost is going to be pretty slow with so many classes, as there is some inefficiency in prediction caching during training.
