3rd party XML parser (xpath.js) gives error "end tag name: div is not match the current start tagName:script"

Using parse.com Cloud Code, I am trying to scrape and clean up data from a web page to send to my iOS app. I have already implemented the web cleanup code on iOS, but I am trying to move this task to the backend. I am using a node.js library called xpath.js.

/**
 * Cloud function "test": downloads the UCLA menu page, parses it into a DOM
 * and reports how many table cells have a class starting with "menugridcell".
 *
 * Responds with "test <count>" on success. Every failure path (HTTP error or
 * an exception thrown while parsing/querying) ends in response.error so the
 * function always completes instead of hanging.
 */
Parse.Cloud.define("test", function(request, response) {
    Parse.Cloud.httpRequest({
        url: "http://menu.ha.ucla.edu/foodpro/default.asp",
        success: function(httpResponse) {
            try {
                var xpath = require("cloud/xpath.js");
                var dom = require("cloud/dom-parser.js").DOMParser;
                var doc = new dom().parseFromString(httpResponse.text);
                var cells = xpath.select("//td[starts-with(@class, 'menugridcell')]", doc);

                // Do all DOM work before responding, and only touch cells[0]
                // when at least one cell matched. Not used yet; kept for the
                // next extraction step.
                var listNode = cells.length ? xpath.select("//ul", cells[0])[0] : undefined;

                // xpath.select returns an array: use .length (arrays have no .count).
                response.success("test " + cells.length);
            } catch (e) {
                // The parser's fatalError handler throws the raw message, which
                // may be a plain string rather than an Error object.
                response.error("Failed to parse menu page: " + (e && e.message ? e.message : e));
            }
        },
        error: function(httpResponse) {
            var message = 'Request failed with response code ' + httpResponse.status;
            console.error(message);
            response.error(message);
        }
    });
});

      

However, when I run the code, I get this error:

"Uncaught end tag name: div is not match the current start tagName:script"

      

As I mentioned earlier, I was able to successfully clean up web data using a separate objective-c library, so the tags are consistent and the problem cannot lie in the original code.

For the source code, here's the webpage I'm scraping . StackOverflow won't let me link directly to the source code, otherwise I would give a direct link.

EDIT:

Here is the code in dom-parser.js

/**
 * Parser facade that drives the SAX XMLReader and returns the document
 * produced by a DOM builder.
 *
 * @param {Object} [options] Settings read by parseFromString: domBuilder,
 *     errorHandler, locator and xmlns. Defaults to {locator:{}} when omitted.
 */
function DOMParser(options){
    if(options){
        this.options = options;
    }else{
        this.options = {locator:{}};
    }
}

/**
 * Parses markup and returns the builder's document.
 *
 * @param {string} source Markup to parse. A falsy source is reported through
 *     the error handler's error() instead of being parsed.
 * @param {string} [mimeType] When it ends in /html, /htm, /xhtml or /xhtm,
 *     the nbsp and copy entities are added and the default namespace is set
 *     to XHTML.
 * @returns {*} The `document` property of the DOM builder.
 */
DOMParser.prototype.parseFromString = function(source,mimeType){
    var settings = this.options;
    var reader = new XMLReader();
    // Acts as both ContentHandler and LexicalHandler.
    var builder = settings.domBuilder || new DOMHandler();
    var customErrorHandler = settings.errorHandler;
    var docLocator = settings.locator;
    var namespaces = settings.xmlns || {};
    var entities = {'lt':'<','gt':'>','amp':'&','quot':'"','apos':"'"};

    if(docLocator){
        builder.setDocumentLocator(docLocator);
    }

    reader.errorHandler = buildErrorHandler(customErrorHandler, builder, docLocator);
    reader.domBuilder = builder;

    var isHtmlLike = /\/x?html?$/.test(mimeType);
    if(isHtmlLike){
        entities.nbsp = '\xa0';
        entities.copy = '\xa9';
        // Note: this writes into the caller's options.xmlns object when one was supplied.
        namespaces[''] = 'http://www.w3.org/1999/xhtml';
    }

    if(!source){
        reader.errorHandler.error("invalid document source");
        return builder.document;
    }
    reader.parse(source, namespaces, entities);
    return builder.document;
};
/**
 * Builds the error handler object handed to the SAX reader.
 *
 * @param {Object|Function} [errorImpl] Handler object (methods looked up by
 *     level name) or a single callback. A callback declared with two
 *     parameters is called as (level, message); any other callback is called
 *     with the message only.
 * @param {Object} domBuilder Used when errorImpl is missing: returned as-is
 *     if it is a DOMHandler, otherwise it takes the place of errorImpl.
 * @param {Object} [locator] Passed to _locator(); the result is appended to
 *     every message.
 * @returns {Object} domBuilder itself, or an object exposing warning, error
 *     and fatalError functions (no-ops when nothing suitable was found).
 */
function buildErrorHandler(errorImpl,domBuilder,locator){
    if(!errorImpl){
        if(domBuilder instanceof DOMHandler){
            return domBuilder;
        }
        errorImpl = domBuilder;
    }
    var handlers = {};
    var implIsFunction = errorImpl instanceof Function;
    locator = locator || {};

    // Finds the function that should receive messages of the given level.
    function resolve(level, fallbacks){
        var found = errorImpl[level];
        if(found){
            return found;
        }
        if(implIsFunction){
            return errorImpl.length == 2
                ? function(msg){ errorImpl(level, msg); }
                : errorImpl;
        }
        // Alternative names are tried from the last one listed to the first.
        for(var i = fallbacks.length - 1; i >= 0; i--){
            found = errorImpl[fallbacks[i]];
            if(found){
                return found;
            }
        }
        return found;
    }

    // Installs handlers[level], decorating each message with the locator text.
    function install(level, fallbacks){
        var target = resolve(level, fallbacks);
        if(target){
            handlers[level] = function(msg){
                target(msg + _locator(locator));
            };
        }else{
            handlers[level] = function(){};
        }
    }

    install('warning', ['warn']);
    install('error', ['warn', 'warning']);
    install('fatalError', ['warn', 'warning', 'error']);
    return handlers;
}
/**
 * SAX handler that assembles a DOM document from parser callbacks.
 *
 * Coverage relative to the SAX interfaces (+ covered, - not covered):
 *   +ContentHandler +ErrorHandler
 *   +LexicalHandler +EntityResolver2
 *   -DeclHandler -DTDHandler
 *
 * DefaultHandler:  EntityResolver, DTDHandler, ContentHandler, ErrorHandler
 * DefaultHandler2: DefaultHandler, LexicalHandler, DeclHandler, EntityResolver2
 * @link http://www.saxproject.org/apidoc/org/xml/sax/helpers/DefaultHandler.html
 */
function DOMHandler() {
    // Set by startCDATA()/endCDATA(); characters() reads it to pick the node type.
    this.cdata = false;
}

/**
 * Copies the locator's current line and column numbers onto a node.
 */
function position(locator,node){
    var line = locator.lineNumber;
    node.lineNumber = line;
    var column = locator.columnNumber;
    node.columnNumber = column;
}

/**
 * Callback implementations.
 * @see org.xml.sax.ContentHandler#startDocument
 * @link http://www.saxproject.org/apidoc/org/xml/sax/ContentHandler.html
 */
DOMHandler.prototype = {
    /** Creates the empty document and records the locator's systemId on it. */
    startDocument : function() {
        var created = new DOMImplementation().createDocument(null, null, null);
        this.document = created;
        if (this.locator) {
            created.documentURI = this.locator.systemId;
        }
    },
    /** Creates an element (qName preferred over localName), attaches it and copies its attributes. */
    startElement:function(namespaceURI, localName, qName, attrs) {
        var ownerDoc = this.document;
        var element = ownerDoc.createElementNS(namespaceURI, qName||localName);
        var attrCount = attrs.length;
        // Attach under the current parent first, then make the new element current.
        appendElement(this, element);
        this.currentElement = element;

        if (this.locator) {
            position(this.locator, element);
        }
        for (var index = 0; index < attrCount; index++) {
            var attrURI = attrs.getURI(index);
            var attrValue = attrs.getValue(index);
            var attrQName = attrs.getQName(index);
            var attrNode = ownerDoc.createAttributeNS(attrURI, attrQName);
            // NOTE(review): getOffset is looked up on the freshly created attribute
            // node, not on `attrs` - possibly a typo in the original; confirm.
            if (attrNode.getOffset) {
                position(attrNode.getOffset(1), attrNode);
            }
            attrNode.nodeValue = attrValue;
            attrNode.value = attrValue;
            element.setAttributeNode(attrNode);
        }
    },
    /** Moves the insertion point back to the parent of the current element. */
    endElement:function(namespaceURI, localName, qName) {
        this.currentElement = this.currentElement.parentNode;
    },
    startPrefixMapping:function(prefix, uri) {
    },
    endPrefixMapping:function(prefix) {
    },
    /** Creates a processing-instruction node and attaches it at the insertion point. */
    processingInstruction:function(target, data) {
        var instruction = this.document.createProcessingInstruction(target, data);
        if (this.locator) {
            position(this.locator, instruction);
        }
        appendElement(this, instruction);
    },
    ignorableWhitespace:function(ch, start, length) {
    },
    /** Appends a text or CDATA node to the current element; empty text or a missing element is skipped. */
    characters:function(chars, start, length) {
        var text = _toString.apply(this,arguments);
        if (!this.currentElement || !text) {
            return;
        }
        var textNode = this.cdata
            ? this.document.createCDATASection(text)
            : this.document.createTextNode(text);
        this.currentElement.appendChild(textNode);
        if (this.locator) {
            position(this.locator, textNode);
        }
    },
    skippedEntity:function(name) {
    },
    /** Normalizes the finished document. */
    endDocument:function() {
        this.document.normalize();
    },
    /** Stores the locator and, when one is given, resets its line counter to 0. */
    setDocumentLocator:function (locator) {
        this.locator = locator;
        if (locator) {
            locator.lineNumber = 0;
        }
    },
    //LexicalHandler
    /** Creates a comment node and attaches it at the insertion point. */
    comment:function(chars, start, length) {
        var text = _toString.apply(this,arguments);
        var commentNode = this.document.createComment(text);
        if (this.locator) {
            position(this.locator, commentNode);
        }
        appendElement(this, commentNode);
    },

    /** Makes subsequent characters() calls produce CDATA sections. */
    startCDATA:function() {
        this.cdata = true;
    },
    /** Makes subsequent characters() calls produce text nodes again. */
    endCDATA:function() {
        this.cdata = false;
    },

    /** Attaches a doctype node when the document's implementation can create one. */
    startDTD:function(name, publicId, systemId) {
        var implementation = this.document.implementation;
        if (!implementation || !implementation.createDocumentType) {
            return;
        }
        var doctype = implementation.createDocumentType(name, publicId, systemId);
        if (this.locator) {
            position(this.locator, doctype);
        }
        appendElement(this, doctype);
    },
    /**
     * @see org.xml.sax.ErrorHandler
     * @link http://www.saxproject.org/apidoc/org/xml/sax/ErrorHandler.html
     */
    warning:function(error) {
        console.warn(error, _locator(this.locator));
    },
    error:function(error) {
        console.error(error, _locator(this.locator));
    },
    /** Logs the error, then throws the value it was given. */
    fatalError:function(error) {
        console.error(error, _locator(this.locator));
        throw error;
    }
};
/**
 * Formats a locator as a "\n@<systemId>#[line:<n>,col:<n>]" suffix for messages.
 *
 * @param {Object} [l] Locator with systemId, lineNumber and columnNumber.
 * @returns {string|undefined} The suffix, or undefined when no locator is given.
 */
function _locator(l){
    if(!l){
        return;
    }
    var source = l.systemId || '';
    return '\n@' + source + '#[line:' + l.lineNumber + ',col:' + l.columnNumber + ']';
}
/**
 * Extracts the reported character range as a JavaScript string.
 *
 * @param {string|Object} chars A JS string, or a non-string character buffer
 *     (the original note mentions the Java SAX bridge on Rhino).
 * @param {number} start Offset of the first character.
 * @param {number} length Number of characters.
 * @returns {*} The substring for JS strings; for other inputs a string built
 *     through java.lang.String when a range applies, else `chars` unchanged.
 */
function _toString(chars,start,length){
    if(typeof chars == 'string'){
        return chars.substr(start,length);
    }
    // Non-string input: convert only when the buffer covers the range or a start offset is given.
    var rangeApplies = chars.length >= start+length || start;
    return rangeApplies
        ? new java.lang.String(chars,start,length)+''
        : chars;
}

/*
 * @link http://www.saxproject.org/apidoc/org/xml/sax/ext/LexicalHandler.html
 * used method of org.xml.sax.ext.LexicalHandler:
 *  #comment(chars, start, length)
 *  #startCDATA()
 *  #endCDATA()
 *  #startDTD(name, publicId, systemId)
 *
 *
 * IGNORED method of org.xml.sax.ext.LexicalHandler:
 *  #endDTD()
 *  #startEntity(name)
 *  #endEntity(name)
 *
 *
 * @link http://www.saxproject.org/apidoc/org/xml/sax/ext/DeclHandler.html
 * IGNORED method of org.xml.sax.ext.DeclHandler
 *  #attributeDecl(eName, aName, type, mode, value)
 *  #elementDecl(name, model)
 *  #externalEntityDecl(name, publicId, systemId)
 *  #internalEntityDecl(name, value)
 * @link http://www.saxproject.org/apidoc/org/xml/sax/ext/EntityResolver2.html
 * IGNORED method of org.xml.sax.EntityResolver2
 *  #resolveEntity(String name,String publicId,String baseURI,String systemId)
 *  #resolveEntity(publicId, systemId)
 *  #getExternalSubset(name, baseURI)
 * @link http://www.saxproject.org/apidoc/org/xml/sax/DTDHandler.html
 * IGNORED method of org.xml.sax.DTDHandler
 *  #notationDecl(name, publicId, systemId) {};
 *  #unparsedEntityDecl(name, publicId, systemId, notationName) {};
 */
// Give every ignored callback listed above a stub on DOMHandler.prototype that returns null.
"endDTD,startEntity,endEntity,attributeDecl,elementDecl,externalEntityDecl,internalEntityDecl,resolveEntity,getExternalSubset,notationDecl,unparsedEntityDecl".split(',').forEach(function(ignoredName){
    DOMHandler.prototype[ignoredName] = function(){
        return null;
    };
});

/* Private static helpers treated below as private instance methods, so don't need to add these to the public API; we might use a Relator to also get rid of non-standard public properties */
/**
 * Attaches a node at the handler's insertion point: under the current
 * element when there is one, otherwise directly under the document.
 */
function appendElement (hander,node) {
    var parent = hander.currentElement ? hander.currentElement : hander.document;
    parent.appendChild(node);
}// appendChild and setAttributeNS are performance-critical

// Module wiring: only runs when a `require` function is available.
if(typeof require == 'function'){
    // SAX reader that DOMParser.prototype.parseFromString instantiates.
    var XMLReader = require('cloud/sax').XMLReader;
    // Used by DOMHandler's startDocument to create the document; also re-exported.
    var DOMImplementation = exports.DOMImplementation = require('cloud/dom').DOMImplementation;
    exports.XMLSerializer = require('cloud/dom').XMLSerializer ;
    // Export the parser constructor defined in this file.
    exports.DOMParser = DOMParser;
}

      

+3


source to share


1 answer


This page contains some XML-like tags inside an HTML script block. Those tags should be ignored, because they appear inside quoted strings. The parser finds a </div> (on a line in the script), tries to match it with the opening <script> tag, and fails. Your parser is trying to read the page as XML and does not know that, in XHTML, the content of a script element is CDATA.

You have to tell the parser to ignore the content of the script tag (or to read it as CDATA). Sorry, but I don't know how to do this.



Best wishes Maggio

0


source







All Articles