From 78e3647d8889d0f85f5e41ce39f65487da4e1dba Mon Sep 17 00:00:00 2001 From: chen-guang-wang <18767185082@163.com> Date: Mon, 12 Sep 2022 10:33:05 +0800 Subject: [PATCH] tableScan support short && boolean && double --- .../cpp/src/jni/OrcColumnarBatchJniReader.cpp | 48 ++++--- .../cpp/src/jni/OrcColumnarBatchJniReader.h | 2 +- .../cpp/test/tablescan/scan_test.cpp | 119 +++++++++++++----- .../spark/jni/OrcColumnarBatchJniReader.java | 28 +++-- 4 files changed, 140 insertions(+), 57 deletions(-) diff --git a/omnioperator/omniop-spark-extension/cpp/src/jni/OrcColumnarBatchJniReader.cpp b/omnioperator/omniop-spark-extension/cpp/src/jni/OrcColumnarBatchJniReader.cpp index 3b38f0dea..492a86dbb 100644 --- a/omnioperator/omniop-spark-extension/cpp/src/jni/OrcColumnarBatchJniReader.cpp +++ b/omnioperator/omniop-spark-extension/cpp/src/jni/OrcColumnarBatchJniReader.cpp @@ -350,33 +350,51 @@ uint64_t copyVarwidth(int maxLen, orc::ColumnVectorBatch *field, int vcType) return (uint64_t)originalVector; } -int copyToOminVec(int maxLen, int vcType, int &ominTypeId, uint64_t &ominVecId, orc::ColumnVectorBatch *field) +int copyToOmniVec(orc::TypeKind vcType, int &omniTypeId, uint64_t &omniVecId, orc::ColumnVectorBatch *field, ...) 
{ switch (vcType) { - case orc::TypeKind::DATE: + case orc::TypeKind::BOOLEAN: { + omniTypeId = static_cast(OMNI_BOOLEAN); + omniVecId = copyFixwidth(field); + break; + } + case orc::TypeKind::SHORT: { + omniTypeId = static_cast(OMNI_SHORT); + omniVecId = copyFixwidth(field); + break; + } + case orc::TypeKind::DATE: { + omniTypeId = static_cast(OMNI_DATE32); + omniVecId = copyFixwidth(field); + break; + } case orc::TypeKind::INT: { - if (vcType == orc::TypeKind::DATE) { - ominTypeId = static_cast(OMNI_DATE32); - } else { - ominTypeId = static_cast(OMNI_INT); - } - ominVecId = copyFixwidth(field); + omniTypeId = static_cast(OMNI_INT); + omniVecId = copyFixwidth(field); break; } case orc::TypeKind::LONG: { - ominTypeId = static_cast(OMNI_LONG); - ominVecId = copyFixwidth(field); + omniTypeId = static_cast(OMNI_LONG); + omniVecId = copyFixwidth(field); + break; + } + case orc::TypeKind::DOUBLE: { + omniTypeId = static_cast(OMNI_DOUBLE); + omniVecId = copyFixwidth(field); break; } case orc::TypeKind::CHAR: case orc::TypeKind::STRING: case orc::TypeKind::VARCHAR: { - ominTypeId = static_cast(OMNI_VARCHAR); - ominVecId = (uint64_t)copyVarwidth(maxLen, field, vcType); + omniTypeId = static_cast(OMNI_VARCHAR); + va_list args; + va_start(args, field); + omniVecId = (uint64_t)copyVarwidth(va_arg(args, int), field, vcType); + va_end(args); break; } default: { - LogsError("orc::TypeKind::UNKNOWN ERROR %d", vcType); + throw std::runtime_error("Native ColumnarFileScan Not support For This Type: " + std::to_string(vcType)); } } return 1; @@ -442,13 +460,13 @@ JNIEXPORT jlong JNICALL Java_com_huawei_boostkit_spark_jni_OrcColumnarBatchJniRe vecCnt = root->fields.size(); batchRowSize = root->fields[0]->numElements; for (int id = 0; id < vecCnt; id++) { - int vcType = baseTp.getSubtype(id)->getKind(); + orc::TypeKind vcType = baseTp.getSubtype(id)->getKind(); int maxLen = baseTp.getSubtype(id)->getMaximumLength(); int ominTypeId = 0; uint64_t ominVecId = 0; try { if (vcType != 
orc::TypeKind::DECIMAL) { - copyToOminVec(maxLen, vcType, ominTypeId, ominVecId, root->fields[id]); + copyToOmniVec(vcType, ominTypeId, ominVecId, root->fields[id], maxLen); } else { copyToOminDecimalVec(baseTp.getSubtype(id)->getPrecision(), ominTypeId, ominVecId, root->fields[id]); diff --git a/omnioperator/omniop-spark-extension/cpp/src/jni/OrcColumnarBatchJniReader.h b/omnioperator/omniop-spark-extension/cpp/src/jni/OrcColumnarBatchJniReader.h index 5d05f7347..bdb90301e 100644 --- a/omnioperator/omniop-spark-extension/cpp/src/jni/OrcColumnarBatchJniReader.h +++ b/omnioperator/omniop-spark-extension/cpp/src/jni/OrcColumnarBatchJniReader.h @@ -140,7 +140,7 @@ int getLiteral(orc::Literal &lit, int leafType, std::string value); int buildLeafs(int leafOp, std::vector &litList, orc::Literal &lit, std::string leafNameString, int leafType, orc::SearchArgumentBuilder &builder); -int copyToOminVec(int maxLen, int vcType, int &ominTypeId, uint64_t &ominVecId, orc::ColumnVectorBatch *field); +int copyToOmniVec(orc::TypeKind vcType, int &omniTypeId, uint64_t &omniVecId, orc::ColumnVectorBatch *field, ...); #ifdef __cplusplus } diff --git a/omnioperator/omniop-spark-extension/cpp/test/tablescan/scan_test.cpp b/omnioperator/omniop-spark-extension/cpp/test/tablescan/scan_test.cpp index 3cfff6a9f..121985787 100644 --- a/omnioperator/omniop-spark-extension/cpp/test/tablescan/scan_test.cpp +++ b/omnioperator/omniop-spark-extension/cpp/test/tablescan/scan_test.cpp @@ -26,6 +26,7 @@ static std::string filename = "/resources/orc_data_all_type"; static orc::ColumnVectorBatch *batchPtr; +static orc::StructVectorBatch *root; /* * CREATE TABLE `orc_test` ( `c1` int, `c2` varChar(60), `c3` string, `c4` bigint, @@ -56,6 +57,7 @@ protected: std::unique_ptr batch = rowReader->createRowBatch(4096); rowReader->next(*batch); batchPtr = batch.release(); + root = static_cast(batchPtr); } // run after each case... 
@@ -69,65 +71,122 @@ TEST_F(ScanTest, test_get_literal) orc::Literal tmpLit(0L); // test get long getLiteral(tmpLit, 0, "123456789"); - ASSERT_EQ(tmpLit.toString() == "123456789", true); + ASSERT_EQ(tmpLit.toString(), "123456789"); // test get string getLiteral(tmpLit, 2, "testStringForLit"); - ASSERT_EQ(tmpLit.toString() == "testStringForLit", true); + ASSERT_EQ(tmpLit.toString(), "testStringForLit"); // test get date getLiteral(tmpLit, 3, "987654321"); - ASSERT_EQ(tmpLit.toString() == "987654321", true); + ASSERT_EQ(tmpLit.toString(), "987654321"); } -TEST_F(ScanTest, test_copy_vec) +TEST_F(ScanTest, test_copy_intVec) { - orc::StructVectorBatch *root = static_cast(batchPtr); int omniType = 0; - uint64_t ominVecId = 0; + uint64_t omniVecId = 0; // int type - copyToOminVec(0, 3, omniType, ominVecId, root->fields[0]); - ASSERT_EQ(omniType == 1, true); - omniruntime::vec::IntVector *olbInt = (omniruntime::vec::IntVector *)(ominVecId); - ASSERT_EQ(olbInt->GetValue(0) == 10, true); + copyToOmniVec(orc::TypeKind::INT, omniType, omniVecId, root->fields[0]); + ASSERT_EQ(omniType, omniruntime::type::OMNI_INT); + omniruntime::vec::IntVector *olbInt = (omniruntime::vec::IntVector *)(omniVecId); + ASSERT_EQ(olbInt->GetValue(0), 10); delete olbInt; +} +TEST_F(ScanTest, test_copy_varCharVec) +{ + int omniType = 0; + uint64_t omniVecId = 0; // varchar type - copyToOminVec(60, 16, omniType, ominVecId, root->fields[1]); - ASSERT_EQ(omniType == 15, true); + copyToOmniVec(orc::TypeKind::VARCHAR, omniType, omniVecId, root->fields[1], 60); + ASSERT_EQ(omniType, omniruntime::type::OMNI_VARCHAR); uint8_t *actualChar = nullptr; - omniruntime::vec::VarcharVector * olbVc = (omniruntime::vec::VarcharVector *)(ominVecId); - int len = olbVc->GetValue(0, &actualChar); + omniruntime::vec::VarcharVector *olbVc = (omniruntime::vec::VarcharVector *)(omniVecId); + int len = olbVc->GetValue(0, &actualChar); std::string actualStr(reinterpret_cast(actualChar), 0, len); - ASSERT_EQ(actualStr == 
"varchar_1", true); + ASSERT_EQ(actualStr, "varchar_1"); delete olbVc; +} +TEST_F(ScanTest, test_copy_stringVec) +{ + int omniType = 0; + uint64_t omniVecId = 0; + uint8_t *actualChar = nullptr; // string type - copyToOminVec(0, 7, omniType, ominVecId, root->fields[2]); - ASSERT_EQ(omniType == 15, true); - omniruntime::vec::VarcharVector *olbStr = (omniruntime::vec::VarcharVector *)(ominVecId); - len = olbStr->GetValue(0, &actualChar); + copyToOmniVec(orc::TypeKind::STRING, omniType, omniVecId, root->fields[2], 0); + ASSERT_EQ(omniType, omniruntime::type::OMNI_VARCHAR); + omniruntime::vec::VarcharVector *olbStr = (omniruntime::vec::VarcharVector *)(omniVecId); + int len = olbStr->GetValue(0, &actualChar); std::string actualStr2(reinterpret_cast(actualChar), 0, len); - ASSERT_EQ(actualStr2 == "string_type_1", true); + ASSERT_EQ(actualStr2, "string_type_1"); delete olbStr; +} +TEST_F(ScanTest, test_copy_longVec) +{ + int omniType = 0; + uint64_t omniVecId = 0; // bigint type - copyToOminVec(0, 4, omniType, ominVecId, root->fields[3]); - ASSERT_EQ(omniType == 2, true); - omniruntime::vec::LongVector *olbLong = (omniruntime::vec::LongVector *)(ominVecId); - ASSERT_EQ(olbLong->GetValue(0) == 10000, true); + copyToOmniVec(orc::TypeKind::LONG, omniType, omniVecId, root->fields[3]); + ASSERT_EQ(omniType, omniruntime::type::OMNI_LONG); + omniruntime::vec::LongVector *olbLong = (omniruntime::vec::LongVector *)(omniVecId); + ASSERT_EQ(olbLong->GetValue(0), 10000); delete olbLong; +} +TEST_F(ScanTest, test_copy_charVec) +{ + int omniType = 0; + uint64_t omniVecId = 0; + uint8_t *actualChar = nullptr; // char type - copyToOminVec(40, 17, omniType, ominVecId, root->fields[4]); - ASSERT_EQ(omniType == 15, true); - omniruntime::vec::VarcharVector *olbChar40 = (omniruntime::vec::VarcharVector *)(ominVecId); - len = olbChar40->GetValue(0, &actualChar); + copyToOmniVec(orc::TypeKind::CHAR, omniType, omniVecId, root->fields[4], 40); + ASSERT_EQ(omniType, omniruntime::type::OMNI_VARCHAR); 
+ omniruntime::vec::VarcharVector *olbChar40 = (omniruntime::vec::VarcharVector *)(omniVecId); + int len = olbChar40->GetValue(0, &actualChar); std::string actualStr3(reinterpret_cast(actualChar), 0, len); - ASSERT_EQ(actualStr3 == "char_1", true); + ASSERT_EQ(actualStr3, "char_1"); delete olbChar40; } +TEST_F(ScanTest, test_copy_doubleVec) +{ + int omniType = 0; + uint64_t omniVecId = 0; + // double type + copyToOmniVec(orc::TypeKind::DOUBLE, omniType, omniVecId, root->fields[6]); + ASSERT_EQ(omniType, omniruntime::type::OMNI_DOUBLE); + omniruntime::vec::DoubleVector *olbDouble = (omniruntime::vec::DoubleVector *)(omniVecId); + ASSERT_DOUBLE_EQ(olbDouble->GetValue(0), 1111.1111); + delete olbDouble; +} + +TEST_F(ScanTest, test_copy_booleanVec) +{ + int omniType = 0; + uint64_t omniVecId = 0; + // boolean type + copyToOmniVec(orc::TypeKind::BOOLEAN, omniType, omniVecId, root->fields[9]); + ASSERT_EQ(omniType, omniruntime::type::OMNI_BOOLEAN); + omniruntime::vec::BooleanVector *olbBoolean = (omniruntime::vec::BooleanVector *)(omniVecId); + ASSERT_EQ(olbBoolean->GetValue(0), true); + delete olbBoolean; +} + +TEST_F(ScanTest, test_copy_shortVec) +{ + int omniType = 0; + uint64_t omniVecId = 0; + // short type + copyToOmniVec(orc::TypeKind::SHORT, omniType, omniVecId, root->fields[10]); + ASSERT_EQ(omniType, omniruntime::type::OMNI_SHORT); + omniruntime::vec::ShortVector *olbShort = (omniruntime::vec::ShortVector *)(omniVecId); + ASSERT_EQ(olbShort->GetValue(0), 11); + delete olbShort; +} + TEST_F(ScanTest, test_build_leafs) { int leafOp = 0; @@ -159,5 +218,5 @@ TEST_F(ScanTest, test_build_leafs) "leaf-0 = (leaf-0 = 100), leaf-1 = (leaf-1 < 100), leaf-2 = (leaf-1 <= 100), leaf-3 = (leaf-1 null_safe_= " "100), leaf-4 = (leaf-1 is null), expr = (and leaf-0 leaf-1 leaf-2 leaf-3 leaf-4)"; - ASSERT_EQ(buildString == result, true); + ASSERT_EQ(buildString, result); } diff --git 
a/omnioperator/omniop-spark-extension/java/src/main/java/com/huawei/boostkit/spark/jni/OrcColumnarBatchJniReader.java b/omnioperator/omniop-spark-extension/java/src/main/java/com/huawei/boostkit/spark/jni/OrcColumnarBatchJniReader.java index 22707a88e..555c1db50 100644 --- a/omnioperator/omniop-spark-extension/java/src/main/java/com/huawei/boostkit/spark/jni/OrcColumnarBatchJniReader.java +++ b/omnioperator/omniop-spark-extension/java/src/main/java/com/huawei/boostkit/spark/jni/OrcColumnarBatchJniReader.java @@ -21,11 +21,7 @@ package com.huawei.boostkit.spark.jni; import nova.hetu.omniruntime.type.DataType; import nova.hetu.omniruntime.type.Decimal64DataType; import nova.hetu.omniruntime.type.Decimal128DataType; -import nova.hetu.omniruntime.vector.IntVec; -import nova.hetu.omniruntime.vector.LongVec; -import nova.hetu.omniruntime.vector.VarcharVec; -import nova.hetu.omniruntime.vector.Decimal128Vec; -import nova.hetu.omniruntime.vector.Vec; +import nova.hetu.omniruntime.vector.*; import org.apache.hadoop.hive.ql.io.sarg.ExpressionTree; import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf; @@ -231,15 +227,28 @@ public class OrcColumnarBatchJniReader { continue; } switch (DataType.DataTypeId.values()[typeIds[nativeGetId]]) { + case OMNI_BOOLEAN: { + vecList[i] = new BooleanVec(vecNativeIds[nativeGetId]); + break; + } + case OMNI_SHORT: { + vecList[i] = new ShortVec(vecNativeIds[nativeGetId]); + break; + } case OMNI_DATE32: case OMNI_INT: { vecList[i] = new IntVec(vecNativeIds[nativeGetId]); break; } - case OMNI_LONG: { + case OMNI_LONG: + case OMNI_DECIMAL64: { vecList[i] = new LongVec(vecNativeIds[nativeGetId]); break; } + case OMNI_DOUBLE: { + vecList[i] = new DoubleVec(vecNativeIds[nativeGetId]); + break; + } case OMNI_VARCHAR: { vecList[i] = new VarcharVec(vecNativeIds[nativeGetId]); break; @@ -248,12 +257,9 @@ public class OrcColumnarBatchJniReader { vecList[i] = new Decimal128Vec(vecNativeIds[nativeGetId], Decimal128DataType.DECIMAL128); break; } - case 
OMNI_DECIMAL64: { - vecList[i] = new LongVec(vecNativeIds[nativeGetId]); - break; - } default: { - LOGGER.error("UNKNOWN TYPE ERROR IN JAVA" + DataType.DataTypeId.values()[typeIds[i]]); + throw new RuntimeException("UnSupport type for ColumnarFileScan:" + + DataType.DataTypeId.values()[typeIds[nativeGetId]]); } } nativeGetId++; -- Gitee