tableInsert/insert into
本小节介绍如何在 C++ API 端通过调用DolphinDB 的 tableInsert
函数以及 insert
into
语句来插入数据。这是一种同步的写入方式,支持单条以及批量写入。
保存数据到内存表
环境准备
通过DolphinDB脚本创建一个共享内存表
t = table(100:0, `name`date`price, [STRING, DATE, DOUBLE]);
share t as tglobal;
使用 insert into
语句保存数据
保存单条数据:
std::string assembleScript(const std::string& name, int date, double price){
return string("insert into tglobal values(`").append(name).append(",").append(std::to_string(date)).append(",").append(std::to_string(price).append(")"));
}
int main(int argc, const char **argv)
{
DBConnection conn;
conn.connect("127.0.0.1", 8848, "admin", "123456");
conn.run(assembleScript("Tom", 0, 1.2));
conn.run(assembleScript("Lily", 1, 2.2));
conn.run(assembleScript("Lucy", 2, 3.2));
std::cout << conn.run("tglobal")->getString() << std::endl;
}
保存多条数据:
int main(int argc, const char **argv)
{
DBConnection conn;
conn.connect("127.0.0.1", 8848, "admin", "123456");
string script;
int rowNum=10000, indexCapacity=10000;
VectorSP names = Util::createVector(DT_STRING, rowNum, indexCapacity);
VectorSP dates = Util::createVector(DT_DATE, rowNum, indexCapacity);
VectorSP prices = Util::createVector(DT_DOUBLE, rowNum, indexCapacity);
int array_dt_buf[Util:: BUF_SIZE]; //定义date列缓冲区数组
double array_db_buf[Util:: BUF_SIZE]; //定义price列缓冲区数组
int start = 0;
int no=0;
while (start < rowNum) {
size_t len = std::min(Util::BUF_SIZE, rowNum - start);
int *dtp = dates->getIntBuffer(start, len, array_dt_buf); //dtp指向每次通过 `getIntBuffer` 得到的缓冲区的头部
double *dbp = prices->getDoubleBuffer(start, len, array_db_buf); //dbp指向每次通过 `getDoubleBuffer` 得到的缓冲区的头部
for (size_t i = 0; i < len; i++) {
names->setString(i+start, "name_"+std::to_string(++no)); //对string类型的name列直接进行赋值,不采用getbuffer的方式
dtp[i] = 17898+i;
dbp[i] = (rand()%100)/3.0;
}
dates->setInt(start, len, dtp); //写完后使用 `setInt` 将缓冲区写回数组
prices->setDouble(start, len, dbp); //写完后使用 `setDouble` 将缓冲区写回数组
start += len;
}
vector<string> allnames = {"names", "dates", "prices"};
vector<ConstantSP> allcols = {names, dates, prices};
conn.upload(allnames, allcols);
script += "insert into tglobal values(names, dates, prices); tglobal";
TableSP table = conn.run(script);
}
使用 tableInsert
保存 TableSP 对象
TableSP createDemoTable(){
vector<string> colNames = {"name", "date","price"};
vector<DATA_TYPE> colTypes = {DT_STRING, DT_DATE, DT_DOUBLE};
int colNum = 3, rowNum = 50, indexCapacity = 50;
ConstantSP table = Util::createTable(colNames, colTypes, rowNum, indexCapacity);
vector<VectorSP> columnVecs;
for(int i = 0; i < colNum; ++i)
columnVecs.push_back(table->getColumn(i));
for(int i = 0; i < rowNum; ++i){
columnVecs[0]->set(i, Util::createString("name_"+std::to_string(i)));
columnVecs[1]->set(i, Util::createDate(2010, 1, i+1));
columnVecs[2]->set(i, Util::createDouble((rand()%100)/3.0));
}
return table;
}
int main(int argc, const char **argv)
{
DBConnection conn;
conn.connect("127.0.0.1", 8848, "admin", "123456");
vector<ConstantSP> args;
TableSP table = createDemoTable();
args.push_back(table);
conn.run("tableInsert{tglobal}", args);
}
保存数据到分布式表
单线程写入
- 环境准备。通过 DolphinDB
脚本创建一个分布式表:
dbPath = "dfs://SAMPLE_TRDDB"; tableName = `demoTable db = database(dbPath, VALUE, 2010.01.01..2010.01.30) pt=db.createPartitionedTable(table(1000000:0, `name`date `price, [STRING,DATE,DOUBLE]), tableName, `date)
- C++
代码如下:
TableSP createDemoTable(){ vector<string> colNames = {"name", "date","price"}; vector<DATA_TYPE> colTypes = {DT_STRING, DT_DATE, DT_DOUBLE}; int colNum = 3, rowNum = 10, indexCapacity = 10; ConstantSP table = Util::createTable(colNames, colTypes, rowNum, indexCapacity); vector<VectorSP> columnVecs; for(int i = 0; i < colNum; ++i) columnVecs.push_back(table->getColumn(i)); for(int i = 0; i < rowNum; ++i){ columnVecs[0]->set(i, Util::createString("name_"+std::to_string(i))); columnVecs[1]->set(i, Util::createDate(2010, 1, i+1)); columnVecs[2]->set(i, Util::createDouble((rand()%100)/3.0)); } return table; } int main(int argc, const char **argv) { DBConnection conn; conn.connect("127.0.0.1", 8848, "admin", "123456"); TableSP table = createDemoTable(); vector<ConstantSP> args; args.push_back(table); conn.run("tableInsert{loadTable('dfs://SAMPLE_TRDDB', `demoTable)}", args); }
并发写入
- 环境准备。 通过 DolphinDB 脚本创建一个分布式表,创建分布式数据库 "dfs://natlog" 和分布式表
"natlogrecords"。其中,数据库按照 VALUE-HASH-HASH
的组合进行三级分区:
dbName="dfs://natlog" tableName="natlogrecords" db1 = database("", VALUE, datehour(2019.09.11T00:00:00)..datehour(2019.09.13T00:00:00) )//starttime, newValuePartitionPolicy=add db2 = database("", HASH, [IPADDR, 5]) //source_address db3 = database("", HASH, [IPADDR, 5]) //destination_address db = database(dbName, COMPO, [db1,db2,db3]) data = table(1:0, ["fwname","filename","source_address","source_port","destination_address","destination_port","nat_source_address","nat_source_port","starttime","stoptime","elapsed_time"], [SYMBOL,STRING,IPADDR,INT,IPADDR,INT,IPADDR,INT,DATETIME,DATETIME,INT]) db.createPartitionedTable(data,tableName,`starttime`source_address`destination_address)
- DolphinDB 不允许多个 writer
同时将数据写入到同一个分区,因此在客户端多线程并行写入数据时,需要确保每个线程分别写入不同的分区。对于按哈希值进行分区的分布式表, DolphinDB
C++ API 提供了
getHash
函数来数据的 hash 值。在客户端设计多线程并发写入分布式表时,可根据哈希分区字段数据的哈希值分组,每组指定一个写线程。这样就能保证每个线程同时将数据写到不同的哈希分区。#include "Concurrent.h" #include "DolphinDB.h" #include "Util.h" #include <iostream> #include <sstream> #include <string> #include <sys/time.h> #include <thread> #include <arpa/inet.h> using namespace dolphindb; using namespace std; #define BUCKETS 50 #define MAX_THREAD_NUM BUCKETS DBConnection conn[MAX_THREAD_NUM]; SmartPointer<BoundedBlockingQueue<TableSP>> tableQueue[MAX_THREAD_NUM]; //参数结构体,存放传入的各项参数 struct parameter { int index; int count; long cLong; long nLong; long nTime; long nStarttime; }; void showUsage() { cout << "DolpinDB Multi-threaded performance test program" << endl; cout << "Usage example:--h=127.0.0.1 --p=8921 --c=1000 --n=5 --t=5 --s=1579080800000" << endl; cout << "Options :" << endl; cout << " --h=127.0.0.1 Mandatory,dolphindb host, Multiple hosts separated by commas" << endl; cout << " --p=8921 Mandatory,dolphindb port, Multiple ports separated by commas.The number of ports should be the same of hosts!" << endl; cout << " --c=1000 Mandatory,The number of records inserted per thread" << endl; cout << " --n=5 Optional,Batches insertions per thread,default is 1" << endl; cout << " --t=5 Optional,Threads number,default is 1,max is " << BUCKETS << endl; cout << " --s=1574380800 Optional,start time,default is " << Util::getEpochTime() / 1000 << endl; cout << " --help Print this help." << endl; } //为不同的分区生成不同的表 TableSP createDemoTable(long rows, long startPartition, long partitionCount, long startTime, int timeInc) { vector<string> colNames = {"fwname", "filename", "source_address", "source_port", "destination_address", "destination_port", "nat_source_address", "nat_source_port", "starttime", "stoptime", "elapsed_time"}; vector<DATA_TYPE> colTypes = {DT_SYMBOL, DT_STRING, DT_IP, DT_INT, DT_IP, DT_INT, DT_IP, DT_INT, DT_DATETIME, DT_DATETIME, DT_INT}; int colNum = 11, rowNum = rows, indexCapacity = rows; ConstantSP table = Util::createTable(colNames, colTypes, rowNum, indexCapacity); vector<VectorSP> columnVecs; for (int i = 0; i < colNum; i++) columnVecs.push_back(table->getColumn(i)); unsigned char sip[16] = {0}; sip[3] = 192; sip[2] = startPartition; sip[1] = partitionCount; ConstantSP spIP = Util::createConstant(DT_IP); for (int j = 1; j < 255; j++) { sip[0] = j; spIP->setBinary(0, 16, sip); //不同的哈希值会写入到不同分区 if (spIP->getHash(BUCKETS) >= startPartition && spIP->getHash(BUCKETS) < startPartition + partitionCount) { break; } } unsigned char ip[16] = {0}; for (int i = 0; i < rowNum; i++) { columnVecs[0]->setString(i, "10.189.45.2:9000"); columnVecs[1]->setString(i, std::to_string(startPartition)); columnVecs[2]->setBinary(i, 16, sip); columnVecs[3]->setInt(i, 1 * i); memcpy(ip, (unsigned char *)&i, 4); columnVecs[4]->setBinary(i, 16, ip); columnVecs[5]->setInt(i, 2 * i); columnVecs[6]->set(i, Util::parseConstant(DT_IP, "192.168.1.1")); columnVecs[7]->setInt(i, 3 * i); columnVecs[8]->setLong(i, startTime + timeInc); columnVecs[9]->setLong(i, i + startTime + 100); columnVecs[10]->setInt(i, i); } return table; } //用tableInsert写数据 void *writeData(void *arg) { struct parameter *pParam; pParam = (struct parameter *)arg; TableSP table; for (unsigned int i = 0; i < pParam->nLong; i++) { tableQueue[pParam->index]->pop(table); long long startTime = Util::getEpochTime(); vector<ConstantSP> args; args.push_back(table); conn[pParam->index].run("tableInsert{loadTable('dfs://natlog', `natlogrecords)}", args); pParam->nTime += Util::getEpochTime() - startTime; } printf("Thread %d,insert %ld rows %ld times, used %ld ms.\n", pParam->index, pParam->cLong, pParam->nLong, pParam->nTime); return NULL; } //生成模拟数据 void *genData(void *arg) { struct parameter *pParam; pParam = (struct parameter *)arg; long partitionCount = BUCKETS / pParam->count; for (unsigned int i = 0; i < pParam->nLong; i++) { TableSP table = createDemoTable(pParam->cLong, partitionCount * pParam->index, partitionCount, pParam->nStarttime, i * 5); tableQueue[pParam->index]->push(table); } return NULL; } int main(int argc, char *argv[]) { if (argc < 2) { cout << "No arguments, you MUST give an argument at least!" << endl; showUsage(); return -1; } //解析参数 int nOptionIndex = 1; string cString, nString, hString, pString, tString, sString; stringstream cSS, nSS, pSS, tSS, sSS; long cLong, nLong, pLong, tLong, sLong; vector<string> vHost, vPort; while (nOptionIndex < argc) { if (strncmp(argv[nOptionIndex], "--c=", 4) == 0) { // get records number per threads cString = &argv[nOptionIndex][4]; } else if (strncmp(argv[nOptionIndex], "--h=", 4) == 0) { // get host hString = &argv[nOptionIndex][4]; } else if (strncmp(argv[nOptionIndex], "--p=", 4) == 0) { // get port pString = &argv[nOptionIndex][4]; } else if (strncmp(argv[nOptionIndex], "--n=", 4) == 0) { // get batches nString = &argv[nOptionIndex][4]; } else if (strncmp(argv[nOptionIndex], "--t=", 4) == 0) { // get thread tString = &argv[nOptionIndex][4]; } else if (strncmp(argv[nOptionIndex], "--s=", 4) == 0) { // get start time sString = &argv[nOptionIndex][4]; } else if (strncmp(argv[nOptionIndex], "--help", 6) == 0) { // help showUsage(); return 0; } else { cout << "Options '" << argv[nOptionIndex] << "' not valid. Run '" << argv[0] << "' for details." << endl; return -1; } nOptionIndex++; } if (cString.empty()) { cout << "--c is required" << endl; showUsage(); return -1; } else { cSS << cString; cSS >> cLong; } if (pString.empty()) { cout << "--p is required" << endl; showUsage(); return -1; } else { vPort = Util::split(pString, ','); } if (hString.empty()) { cout << "--h is required" << endl; showUsage(); return -1; } else { vHost = Util::split(hString, ','); } if (nString.empty()) { nLong = 1; } else { nSS << nString; nSS >> nLong; } if (tString.empty()) { tLong = 1; } else { tSS << tString; tSS >> tLong; } if (sString.empty()) { sLong = Util::getEpochTime() / 1000; // 1574380800; cout << "starttime=" << sLong << endl; } else { sSS << sString; sSS >> sLong; } if (tLong > BUCKETS) { cout << "The number of threads must be less than " << BUCKETS << endl; showUsage(); return -1; } if (vHost.size() != vPort.size()) { cout << "The number of host and port must be the same! " << vHost.size() << ":" << vPort.size() << endl; showUsage(); return -1; } try { for (int i = 0; i < tLong; ++i) { hString = vHost[i % vHost.size()]; pLong = std::stol(vPort[i % vPort.size()]); bool ret = conn[i].connect(hString, pLong, "admin", "123456"); if (!ret) { cout << "Failed to connect to the server" << endl; return 0; } tableQueue[i] = new BoundedBlockingQueue<TableSP>(2); } } catch (exception &ex) { cout << "Failed to connect with error: " << ex.what(); return -1; } cout << "Please waiting..." << endl; //启动生成数据线程以及写入线程 long long startTime = Util::getEpochTime(); struct parameter arg[tLong]; std::thread genThreads[tLong]; std::thread writeThreads[tLong]; for (int i = 0; i < tLong; ++i) { arg[i].index = i; arg[i].count = tLong; arg[i].nLong = nLong; arg[i].cLong = cLong; arg[i].nTime = 0; arg[i].nStarttime = sLong; genThreads[i] = std::thread(genData, (void *)&arg[i]); writeThreads[i] = std::thread(writeData, (void *)&arg[i]); } for (int i = 0; i < tLong; ++i) { genThreads[i].join(); writeThreads[i].join(); } //工作完成后,统计耗时 long long endTime = Util::getEpochTime(); long long rowCount = cLong * nLong * tLong; cout << "Inserted " << rowCount << " rows, took a total of " + std::to_string(endTime - startTime) + " ms. " << rowCount / (endTime - startTime) * 1000 / 10000 << " w/s " << endl; long timeSum = arg[0].nTime; for (int i = 1; i < tLong; ++i) { timeSum += arg[i].nTime; } cout << "Total time minus data preparation time: " << std::to_string(timeSum / (double)tLong) + " ms" << endl; return 0; }