Easiest way to import a simple csv file into a graph using OrientDB ETL
I would like to import a very simple directed graph csv file in OrientDB. Specifically, the file is the roadNet-PA dataset from the SNAP collection https://snap.stanford.edu/data/roadNet-PA.html . The first lines of the file are as follows:
# Directed graph (each unordered pair of nodes is saved once)
# Pennsylvania road network
# Nodes: 1088092 Edges: 3083796
# FromNodeId ToNodeId
0 1
0 6309
0 6353
1 0
6353 0
6353 6354
There is only one type of vertex (road intersection) and the edges carry no properties (I believe OrientDB lightweight edges are best for this). Also note that the two columns are tab-separated.
I tried to create a simple etl to import the file with no success. Like this:
{
  "config": {
    "log": "debug"
  },
  "source": {
    "file": {
      "path": "/tmp/roadNet-PA.csv"
    }
  },
  "extractor": {
    "row": {}
  },
  "transformers": [
    {
      "csv": {
        "separator": " ",
        "skipFrom": 1,
        "skipTo": 4
      }
    },
    {
      "vertex": {
        "class": "Intersection"
      }
    },
    {
      "edge": {
        "class": "Road"
      }
    }
  ],
  "loader": {
    "orientdb": {
      "dbURL": "remote:localhost/roads",
      "dbType": "graph",
      "classes": [
        { "name": "Intersection", "extends": "V" },
        { "name": "Road", "extends": "E" }
      ],
      "indexes": [
        {
          "class": "Intersection",
          "fields": ["id:integer"],
          "type": "UNIQUE"
        }
      ]
    }
  }
}
The ETL runs, but it doesn't import the file as I expect. I suppose the problem is with the transformers. My idea is to read the CSV line by line and, for each line, create an edge connecting the two vertices, but I'm not sure how to express this in the ETL file. Any ideas?
source to share
Try the following:
{
  "config": {
    "log": "debug"
  },
  "source": {
    "file": {
      "path": "/tmp/roadNet-PA.csv"
    }
  },
  "extractor": {
    "row": {}
  },
  "transformers": [
    {
      "csv": {
        "separator": "\t",
        "skipFrom": 1,
        "skipTo": 4,
        "columnsOnFirstLine": false,
        "columns": ["id", "to"]
      }
    },
    {
      "vertex": {
        "class": "Intersection"
      }
    },
    {
      "merge": {
        "joinFieldName": "id",
        "lookup": "Intersection.id"
      }
    },
    {
      "edge": {
        "class": "Road",
        "joinFieldName": "to",
        "lookup": "Intersection.id",
        "unresolvedLinkAction": "CREATE"
      }
    }
  ],
  "loader": {
    "orientdb": {
      "dbURL": "remote:localhost/roads",
      "dbType": "graph",
      "wal": false,
      "batchCommit": 1000,
      "tx": true,
      "txUseLog": false,
      "useLightweightEdges": true,
      "classes": [
        { "name": "Intersection", "extends": "V" },
        { "name": "Road", "extends": "E" }
      ],
      "indexes": [
        {
          "class": "Intersection",
          "fields": ["id:integer"],
          "type": "UNIQUE"
        }
      ]
    }
  }
}
To speed up loading, I suggest you shut down the server and run the ETL import using "plocal:" instead of "remote:". For example, replace the existing dbURL with:
"dbURL": "plocal:/orientdb/databases/roads",
source to share
Finally it worked. I moved the merge transformer in front of the vertex transformer, as Luka suggested. I also renamed the "id" field to "from" to avoid the error "Property key is reserved for all elements: id". Here's the resulting configuration:
{
  "config": {
    "log": "debug"
  },
  "source": {
    "file": {
      "path": "/tmp/roads.csv"
    }
  },
  "extractor": {
    "row": {}
  },
  "transformers": [
    {
      "csv": {
        "separator": "\t",
        "columnsOnFirstLine": false,
        "columns": ["from", "to"]
      }
    },
    {
      "merge": {
        "joinFieldName": "from",
        "lookup": "Intersection.from"
      }
    },
    {
      "vertex": {
        "class": "Intersection"
      }
    },
    {
      "edge": {
        "class": "Road",
        "joinFieldName": "to",
        "lookup": "Intersection.from",
        "unresolvedLinkAction": "CREATE"
      }
    }
  ],
  "loader": {
    "orientdb": {
      "dbURL": "remote:localhost/roads",
      "dbType": "graph",
      "wal": false,
      "batchCommit": 1000,
      "tx": true,
      "txUseLog": false,
      "useLightweightEdges": true,
      "classes": [
        { "name": "Intersection", "extends": "V" },
        { "name": "Road", "extends": "E" }
      ],
      "indexes": [
        {
          "class": "Intersection",
          "fields": ["from:integer"],
          "type": "UNIQUE"
        }
      ]
    }
  }
}
source to share