26#include "velox/exec/tests/utils/AssertQueryBuilder.h"
27#include "velox/exec/tests/utils/PlanBuilder.h"
28#include "velox/exec/tests/utils/TempDirectoryPath.h"
29#include "BaseFunction.h"
30#include "velox/vector/tests/utils/VectorTestBase.h"
32using namespace facebook::velox;
33using namespace facebook::velox::test;
34using namespace facebook::velox::exec::test;
35using namespace facebook::velox::memory;
52 const SelectivityVector& rows,
53 std::vector<VectorPtr>& args,
55 exec::EvalCtx& context,
56 VectorPtr& output)
const override {
57 BaseVector::ensureWritable(rows, type, context.pool(), output);
59 std::vector<int> results;
61 BaseVector* baseVec = args[0].get();
62 exec::LocalDecodedVector vecHolder(context, *baseVec, rows);
63 auto decodedArray = vecHolder.get();
64 auto inputTimes = decodedArray->base()->as<FlatVector<int64_t>>();
66 const int secondsInADay = 86400;
67 for (
int i = 0; i < rows.size(); i++) {
68 int64_t timestamp = inputTimes->valueAt(i);
70 std::time_t time =
static_cast<std::time_t
>(timestamp);
71 std::tm* time_info = std::localtime(&time);
72 int dayOfWeek = time_info->tm_wday;
75 if (dayOfWeek == 0 || dayOfWeek == 6) {
82 VectorMaker maker{context.pool()};
83 auto localResult = maker.flatVector<
int>(results);
84 context.moveOrCopyResult(localResult, rows, output);
85 output = maker.flatVector<
int>(results, INTEGER());
92 static std::vector<std::shared_ptr<exec::FunctionSignature>>
signatures() {
93 return {exec::FunctionSignatureBuilder()
94 .argumentType(
"BIGINT")
95 .returnType(
"INTEGER")
121 CostEstimate
getCost(std::vector<int> inputDims) {
123 return CostEstimate(0, inputDims[0], inputDims[1]);
142 const SelectivityVector& rows,
143 std::vector<VectorPtr>& args,
145 exec::EvalCtx& context,
146 VectorPtr& output)
const override {
147 BaseVector::ensureWritable(rows, type, context.pool(), output);
149 std::vector<int> results;
151 BaseVector* baseVec = args[0].get();
152 exec::LocalDecodedVector vecHolder(context, *baseVec, rows);
153 auto decodedArray = vecHolder.get();
154 auto birthYears = decodedArray->base()->as<FlatVector<int>>();
156 auto now = std::chrono::system_clock::now();
157 std::time_t currentTime = std::chrono::system_clock::to_time_t(now);
158 std::tm* localTime = std::localtime(&currentTime);
159 int currentYear = 1900 + localTime->tm_year;
161 for (
int i = 0; i < rows.size(); i++) {
162 int birthYear = birthYears->valueAt(i);
163 results.push_back(currentYear - birthYear);
166 VectorMaker maker{context.pool()};
167 output = maker.flatVector<
int>(results);
174 static std::vector<std::shared_ptr<exec::FunctionSignature>>
signatures() {
175 return {exec::FunctionSignatureBuilder()
176 .argumentType(
"INTEGER")
177 .returnType(
"INTEGER")
203 CostEstimate
getCost(std::vector<int> inputDims) {
205 return CostEstimate(0, inputDims[0], inputDims[1]);
224 const SelectivityVector& rows,
225 std::vector<VectorPtr>& args,
227 exec::EvalCtx& context,
228 VectorPtr& output)
const override {
229 BaseVector::ensureWritable(rows, type, context.pool(), output);
231 int secondsInADay = 86400;
232 std::vector<std::vector<float>> results;
234 BaseVector* base0 = args[0].get();
235 BaseVector* base1 = args[1].get();
236 BaseVector* base2 = args[2].get();
237 BaseVector* base3 = args[3].get();
239 exec::LocalDecodedVector firstHolder(context, *base0, rows);
240 auto decodedArray0 = firstHolder.get();
242 exec::LocalDecodedVector secondHolder(context, *base1, rows);
243 auto decodedArray1 = secondHolder.get();
245 exec::LocalDecodedVector thirdHolder(context, *base2, rows);
246 auto decodedArray2 = thirdHolder.get();
248 exec::LocalDecodedVector fourthHolder(context, *base3, rows);
249 auto decodedArray3 = fourthHolder.get();
251 for (
int i = 0; i < rows.size(); i++) {
252 float totalOrder = (
static_cast<float>(decodedArray0->valueAt<int64_t>(i))) / 79.0;
253 float tAmount = (decodedArray1->valueAt<
float>(i)) / 16048.0;
254 float timeDiff = (
static_cast<float>(decodedArray2->valueAt<int64_t>(i))) / 729.0;
255 int64_t tTimestamp = decodedArray3->valueAt<int64_t>(i);
258 std::time_t time =
static_cast<std::time_t
>(tTimestamp);
259 std::tm* time_info = std::localtime(&time);
260 float dayOfWeek = (
static_cast<float>(time_info->tm_wday)) / 6.0;
263 float daysSinceEpoch =
264 (
static_cast<float>(tTimestamp / secondsInADay)) / 15338.0;
266 std::vector<float> vec;
267 vec.push_back(totalOrder);
268 vec.push_back(tAmount);
269 vec.push_back(timeDiff);
270 vec.push_back(dayOfWeek);
271 vec.push_back(daysSinceEpoch);
273 results.push_back(vec);
276 VectorMaker maker{context.pool()};
277 output = maker.arrayVector<
float>(results, REAL());
284 static std::vector<std::shared_ptr<exec::FunctionSignature>>
signatures() {
285 return {exec::FunctionSignatureBuilder()
286 .argumentType(
"BIGINT")
287 .argumentType(
"REAL")
288 .argumentType(
"BIGINT")
289 .argumentType(
"BIGINT")
290 .returnType(
"ARRAY(REAL)")
299 return "get_transaction_features";
316 CostEstimate
getCost(std::vector<int> inputDims) {
318 return CostEstimate(0, inputDims[0], inputDims[1]);
337 const SelectivityVector& rows,
338 std::vector<VectorPtr>& args,
340 exec::EvalCtx& context,
341 VectorPtr& output)
const override {
342 BaseVector::ensureWritable(rows, type, context.pool(), output);
344 int secondsInADay = 86400;
345 std::vector<std::vector<float>> results;
347 BaseVector* base0 = args[0].get();
348 BaseVector* base1 = args[1].get();
349 BaseVector* base2 = args[2].get();
350 BaseVector* base3 = args[3].get();
352 exec::LocalDecodedVector firstHolder(context, *base0, rows);
353 auto decodedArray0 = firstHolder.get();
355 exec::LocalDecodedVector secondHolder(context, *base1, rows);
356 auto decodedArray1 = secondHolder.get();
358 exec::LocalDecodedVector thirdHolder(context, *base2, rows);
359 auto decodedArray2 = thirdHolder.get();
361 exec::LocalDecodedVector fourthHolder(context, *base3, rows);
362 auto decodedArray3 = fourthHolder.get();
364 for (
int i = 0; i < rows.size(); i++) {
366 (
static_cast<float>(decodedArray0->valueAt<
int>(i))) / 35352.0;
367 float cCustFlag =
static_cast<float>(decodedArray1->valueAt<
int>(i));
368 float cBirthCountry =
369 (
static_cast<float>(decodedArray2->valueAt<
int>(i))) / 211.0;
370 float cAge = (
static_cast<float>(decodedArray3->valueAt<
int>(i))) / 94.0;
372 std::vector<float> vec;
373 vec.push_back(cAddressNum);
374 vec.push_back(cCustFlag);
375 vec.push_back(cBirthCountry);
378 results.push_back(vec);
381 VectorMaker maker{context.pool()};
382 output = maker.arrayVector<
float>(results, REAL());
389 static std::vector<std::shared_ptr<exec::FunctionSignature>>
signatures() {
390 return {exec::FunctionSignatureBuilder()
391 .argumentType(
"INTEGER")
392 .argumentType(
"INTEGER")
393 .argumentType(
"INTEGER")
394 .argumentType(
"INTEGER")
395 .returnType(
"ARRAY(REAL)")
404 return "get_customer_features";
421 CostEstimate
getCost(std::vector<int> inputDims) {
423 return CostEstimate(0, inputDims[0], inputDims[1]);
442 const SelectivityVector& rows,
443 std::vector<VectorPtr>& args,
445 exec::EvalCtx& context,
446 VectorPtr& output)
const override {
447 BaseVector::ensureWritable(rows, type, context.pool(), output);
449 BaseVector* left = args[0].get();
450 BaseVector* right = args[1].get();
452 exec::LocalDecodedVector decodedInput1(context, *args[0], rows);
453 exec::LocalDecodedVector decodedInput2(context, *args[1], rows);
455 std::vector<int64_t> results;
456 int secondsInADay = 86400;
458 for (
int i = 0; i < rows.size(); i++) {
459 if (!rows.isValid(i)) {
462 int64_t timestamp1 = decodedInput1->valueAt<int64_t>(i);
463 int64_t timestamp2 = decodedInput2->valueAt<int64_t>(i);
465 int64_t differenceInSeconds = std::abs(timestamp1 - timestamp2);
466 int64_t differenceInDays = differenceInSeconds / secondsInADay;
467 results.push_back(differenceInDays);
470 VectorMaker maker{context.pool()};
471 auto localResult = maker.flatVector<int64_t>(results);
472 context.moveOrCopyResult(localResult, rows, output);
479 static std::vector<std::shared_ptr<exec::FunctionSignature>>
signatures() {
480 return {exec::FunctionSignatureBuilder()
481 .argumentType(
"BIGINT")
482 .argumentType(
"BIGINT")
483 .returnType(
"BIGINT")
492 return "time_diff_in_days";
509 CostEstimate
getCost(std::vector<int> inputDims) {
511 return CostEstimate(0, inputDims[0], inputDims[1]);
526 dateFormat = dateFormat_;
538 const SelectivityVector& rows,
539 std::vector<VectorPtr>& args,
541 exec::EvalCtx& context,
542 VectorPtr& output)
const override {
543 BaseVector::ensureWritable(rows, type, context.pool(), output);
545 exec::LocalDecodedVector decodedStringHolder(context, *args[0], rows);
546 auto decodedStringInput = decodedStringHolder.get();
548 std::vector<int64_t> results;
549 struct std::tm t = {};
551 for (
int i = 0; i < rows.size(); i++) {
552 StringView val = decodedStringInput->valueAt<StringView>(i);
553 std::string inputStr = std::string(val);
555 std::istringstream ss(inputStr);
556 ss >> std::get_time(&t, dateFormat);
560 std::cerr <<
"Failed to parse date string " << inputStr << std::endl;
561 results.push_back(0);
566 time_t tt = mktime(&t);
568 int64_t timestamp =
static_cast<int64_t
>(tt);
569 results.push_back(timestamp);
572 VectorMaker maker{context.pool()};
573 output = maker.flatVector<int64_t>(results, BIGINT());
580 static std::vector<std::shared_ptr<exec::FunctionSignature>>
signatures() {
581 return {exec::FunctionSignatureBuilder()
582 .argumentType(
"VARCHAR")
583 .returnType(
"BIGINT")
592 return "date_to_timestamp";
609 CostEstimate
getCost(std::vector<int> inputDims) {
611 return CostEstimate(0, inputDims[0], inputDims[1]);
615 const char* dateFormat;
633 const SelectivityVector& rows,
634 std::vector<VectorPtr>& args,
636 exec::EvalCtx& context,
637 VectorPtr& output)
const override {
638 BaseVector::ensureWritable(rows, type, context.pool(), output);
640 std::vector<int> results;
642 BaseVector* baseVec = args[0].get();
643 exec::LocalDecodedVector vecHolder(context, *baseVec, rows);
644 auto decodedArray = vecHolder.get();
645 auto inputProbs = decodedArray->base()->as<ArrayVector>();
646 auto inputProbsValues = inputProbs->elements()->asFlatVector<
float>();
648 for (
int i = 0; i < rows.size(); i++) {
649 int32_t offset = inputProbs->offsetAt(i);
650 float prob_0 = inputProbsValues->valueAt(offset);
651 float prob_1 = inputProbsValues->valueAt(offset + 1);
652 if (std::isnan(prob_0) || std::isnan(prob_1)) {
653 results.push_back(0);
655 int predicted_class = (prob_0 > prob_1) ? 0 : 1;
656 results.push_back(predicted_class);
660 VectorMaker maker{context.pool()};
661 output = maker.flatVector<
int>(results);
668 static std::vector<std::shared_ptr<exec::FunctionSignature>>
signatures() {
669 return {exec::FunctionSignatureBuilder()
670 .argumentType(
"ARRAY(REAL)")
671 .returnType(
"INTEGER")
680 return "get_binary_class";
697 CostEstimate
getCost(std::vector<int> inputDims) {
699 return CostEstimate(0, inputDims[0], inputDims[1]);
708 std::unordered_map<std::string, int> countryMap;
711 std::string filePath =
"/home/velox/resources/data/country_mapping.txt";
712 std::ifstream file(filePath.c_str());
713 if (!file.is_open()) {
714 std::cerr <<
"Error: Could not open the file!" << std::endl;
720 while (std::getline(file, line)) {
721 std::stringstream ss(line);
723 std::string value_str;
726 std::getline(ss, key,
',');
729 std::getline(ss, value_str);
732 int value = std::stoi(value_str);
735 countryMap[key] = value;
std::unordered_map< std::string, int > getCountryMap()
Reads a file containing country mappings and returns an unordered_map.
Definition FraudDetectionFunctions.h:707
static std::vector< std::shared_ptr< exec::FunctionSignature > > signatures()
Returns the function signatures.
Definition FraudDetectionFunctions.h:580
void apply(const SelectivityVector &rows, std::vector< VectorPtr > &args, const TypePtr &type, exec::EvalCtx &context, VectorPtr &output) const override
Applies the function to convert a date string to a timestamp.
Definition FraudDetectionFunctions.h:537
CostEstimate getCost(std::vector< int > inputDims)
Estimates the cost of the function.
Definition FraudDetectionFunctions.h:609
static std::string getName()
Returns the name of the function.
Definition FraudDetectionFunctions.h:591
DateToTimestamp(const char *dateFormat_)
Constructor for DateToTimestamp.
Definition FraudDetectionFunctions.h:525
float * getTensor() const override
Returns the tensor associated with the function.
Definition FraudDetectionFunctions.h:599
Implements a function to calculate the age based on the birth year.
Definition FraudDetectionFunctions.h:131
float * getTensor() const override
Returns the tensor associated with the function.
Definition FraudDetectionFunctions.h:193
static std::string getName()
Returns the name of the function.
Definition FraudDetectionFunctions.h:185
static std::vector< std::shared_ptr< exec::FunctionSignature > > signatures()
Returns the function signatures.
Definition FraudDetectionFunctions.h:174
CostEstimate getCost(std::vector< int > inputDims)
Estimates the cost of the function.
Definition FraudDetectionFunctions.h:203
void apply(const SelectivityVector &rows, std::vector< VectorPtr > &args, const TypePtr &type, exec::EvalCtx &context, VectorPtr &output) const override
Applies the function to calculate the age.
Definition FraudDetectionFunctions.h:141
Implements a function to determine the binary class based on probabilities.
Definition FraudDetectionFunctions.h:622
void apply(const SelectivityVector &rows, std::vector< VectorPtr > &args, const TypePtr &type, exec::EvalCtx &context, VectorPtr &output) const override
Applies the function to determine the binary class.
Definition FraudDetectionFunctions.h:632
static std::vector< std::shared_ptr< exec::FunctionSignature > > signatures()
Returns the function signatures.
Definition FraudDetectionFunctions.h:668
float * getTensor() const override
Returns the tensor associated with the function.
Definition FraudDetectionFunctions.h:687
static std::string getName()
Returns the name of the function.
Definition FraudDetectionFunctions.h:679
CostEstimate getCost(std::vector< int > inputDims)
Estimates the cost of the function.
Definition FraudDetectionFunctions.h:697
Implements a function to extract features from customer data.
Definition FraudDetectionFunctions.h:326
CostEstimate getCost(std::vector< int > inputDims)
Estimates the cost of the function.
Definition FraudDetectionFunctions.h:421
static std::string getName()
Returns the name of the function.
Definition FraudDetectionFunctions.h:403
float * getTensor() const override
Returns the tensor associated with the function.
Definition FraudDetectionFunctions.h:411
static std::vector< std::shared_ptr< exec::FunctionSignature > > signatures()
Returns the function signatures.
Definition FraudDetectionFunctions.h:389
void apply(const SelectivityVector &rows, std::vector< VectorPtr > &args, const TypePtr &type, exec::EvalCtx &context, VectorPtr &output) const override
Applies the function to extract customer features.
Definition FraudDetectionFunctions.h:336
Implements a function to extract features from transaction data.
Definition FraudDetectionFunctions.h:213
static std::vector< std::shared_ptr< exec::FunctionSignature > > signatures()
Returns the function signatures.
Definition FraudDetectionFunctions.h:284
float * getTensor() const override
Returns the tensor associated with the function.
Definition FraudDetectionFunctions.h:306
static std::string getName()
Returns the name of the function.
Definition FraudDetectionFunctions.h:298
void apply(const SelectivityVector &rows, std::vector< VectorPtr > &args, const TypePtr &type, exec::EvalCtx &context, VectorPtr &output) const override
Applies the function to extract transaction features.
Definition FraudDetectionFunctions.h:223
CostEstimate getCost(std::vector< int > inputDims)
Estimates the cost of the function.
Definition FraudDetectionFunctions.h:316
Implements a function to check if a given timestamp corresponds to a weekday.
Definition FraudDetectionFunctions.h:41
static std::string getName()
Returns the name of the function.
Definition FraudDetectionFunctions.h:103
float * getTensor() const override
Returns the tensor associated with the function.
Definition FraudDetectionFunctions.h:111
static std::vector< std::shared_ptr< exec::FunctionSignature > > signatures()
Returns the function signatures.
Definition FraudDetectionFunctions.h:92
CostEstimate getCost(std::vector< int > inputDims)
Estimates the cost of the function.
Definition FraudDetectionFunctions.h:121
void apply(const SelectivityVector &rows, std::vector< VectorPtr > &args, const TypePtr &type, exec::EvalCtx &context, VectorPtr &output) const override
Applies the function to check if the timestamp is a weekday.
Definition FraudDetectionFunctions.h:51
A base class for machine learning functions, inheriting from Velox's VectorFunction.
Definition BaseFunction.h:9
Implements a function to calculate the difference in days between two timestamps.
Definition FraudDetectionFunctions.h:431
static std::vector< std::shared_ptr< exec::FunctionSignature > > signatures()
Returns the function signatures.
Definition FraudDetectionFunctions.h:479
float * getTensor() const override
Returns the tensor associated with the function.
Definition FraudDetectionFunctions.h:499
static std::string getName()
Returns the name of the function.
Definition FraudDetectionFunctions.h:491
void apply(const SelectivityVector &rows, std::vector< VectorPtr > &args, const TypePtr &type, exec::EvalCtx &context, VectorPtr &output) const override
Applies the function to calculate the time difference in days.
Definition FraudDetectionFunctions.h:441
CostEstimate getCost(std::vector< int > inputDims)
Estimates the cost of the function.
Definition FraudDetectionFunctions.h:509