18#include <fmt/format.h>
20#include "BaseFunction.h"
21#include "velox/exec/tests/utils/AssertQueryBuilder.h"
22#include "velox/exec/tests/utils/PlanBuilder.h"
23#include "velox/exec/tests/utils/TempDirectoryPath.h"
24#include "velox/vector/tests/utils/VectorTestBase.h"
26using namespace facebook::velox;
27using namespace facebook::velox::test;
28using namespace facebook::velox::exec::test;
29using namespace facebook::velox::memory;
54 const SelectivityVector& rows,
55 std::vector<VectorPtr>& args,
57 exec::EvalCtx& context,
58 VectorPtr& output)
const override {
// Fragment of an apply() override taken from a line-numbered listing: the
// line that opens the signature and several body lines are absent from this
// excerpt (the listing numbers skip, e.g. 53, 56, 79-81, 84-86).
//
// Visible behaviour: for every selected row, look up an int taken from the
// input's elements vector in mapping_ and, on a hit, store a one-element
// array holding the mapped value.

// Request a writable ARRAY(INTEGER) `output` for `rows` from the context pool.
59 BaseVector::ensureWritable(rows, ARRAY(INTEGER()), context.pool(), output);
// args[0] is treated as an ArrayVector; its elements are viewed as flat ints.
62 auto arrayVector = args[0]->as<ArrayVector>();
63 auto elementsVector = arrayVector->elements()->asFlatVector<
int>();
// `result` is sized by rows.size() and indexed by row number below.
66 auto numInputs = rows.size();
67 std::vector<std::vector<int>> result(numInputs);
70 rows.applyToSelected([&](
int row) {
// NOTE(review): the elements vector is indexed by `row` itself, not through
// arrayVector->offsetAt(row). That only reads the right value if each input
// array has exactly one element stored at position `row` - confirm.
72 auto userIdBeforeEncode = elementsVector->valueAt(row);
75 auto it = mapping_.find(userIdBeforeEncode);
76 if (it != mapping_.end()) {
// Hit: the row's output is a single-element array with the mapped value.
78 result[row] = {it->second};
// Presumably the miss branch (its `else` line is not in this excerpt): logs
// the unmapped key and the mapping size. No assignment to result[row] is
// visible on this path.
82 LOG(WARNING) <<
"[WARNING] Missing key: " << userIdBeforeEncode
83 <<
" mapping size: " << mapping_.size() << std::endl;
// `output` is reassigned to a vector built from `result`.
// NOTE(review): this replaces whatever ensureWritable prepared above - confirm
// that dropping the prepared vector is intended.
87 VectorMaker maker{context.pool()};
88 output = maker.arrayVector<
int>(result, INTEGER());
95 static std::vector<std::shared_ptr<exec::FunctionSignature>>
signatures() {
// Starts a FunctionSignatureBuilder chain with argument type "array(INTEGER)"
// and return type "array(INTEGER)". The rest of the expression is not part of
// this excerpt.
96 return {exec::FunctionSignatureBuilder()
97 .argumentType(
"array(INTEGER)")
98 .returnType(
"array(INTEGER)")
124 CostEstimate
getCost(std::vector<int> inputDims) {
// Builds a CostEstimate from a literal 0 and the first two entries of
// inputDims.
// NOTE(review): inputDims is taken by value and read at [0] and [1] without a
// size check; presumably callers always pass at least two dimensions - confirm.
126 return CostEstimate(0, inputDims[0], inputDims[1]);
130 std::unordered_map<int, int> mapping_;
156 const SelectivityVector& rows,
157 std::vector<VectorPtr>& args,
159 exec::EvalCtx& context,
160 VectorPtr& output)
const override {
// Fragment of an apply() override taken from a line-numbered listing: the
// opening of the signature, the parameter line that declares `type`, and
// several body lines are absent from this excerpt.
//
// Visible behaviour: for every selected row, read a StringView from the
// decoded first argument, look its string up in mapping_ and, on a hit, store
// a one-element array holding the mapped value.

// Request a writable `output` of the requested `type` for `rows`.
161 BaseVector::ensureWritable(rows, type, context.pool(), output);
// Values are read through a LocalDecodedVector built over args[0] and `rows`.
164 exec::LocalDecodedVector decodedStringHolder(context, *args[0], rows);
165 auto decodedStringInput = decodedStringHolder.get();
// `result` is sized by rows.size() and indexed by row number below.
166 int numInputs = rows.size();
168 std::vector<std::vector<int>> result(numInputs);
170 rows.applyToSelected([&](
int row) {
171 StringView val = decodedStringInput->valueAt<StringView>(row);
// NOTE(review): getString() is called for the lookup and again when logging;
// it presumably builds a std::string each time - confirm if this path is hot.
172 auto it = mapping_.find(val.getString());
173 if (it != mapping_.end()) {
// Hit: the row's output is a single-element array with the mapped value.
174 result[row] = {it->second};
// Presumably the miss branch (its `else` line and the end of this statement
// are not in this excerpt): logs the unmapped key.
178 LOG(WARNING) <<
"[WARNING] Missing key: " << val.getString()
// `output` is reassigned to a vector built from `result`.
// NOTE(review): this replaces whatever ensureWritable prepared above - confirm
// that dropping the prepared vector is intended.
183 VectorMaker maker{context.pool()};
184 output = maker.arrayVector<
int>(result, INTEGER());
191 static std::vector<std::shared_ptr<exec::FunctionSignature>>
signatures() {
// Starts a FunctionSignatureBuilder chain with argument type "VARCHAR" and
// return type "array(INTEGER)". The rest of the expression is not part of this
// excerpt.
192 return {exec::FunctionSignatureBuilder()
193 .argumentType(
"VARCHAR")
194 .returnType(
"array(INTEGER)")
203 return "encoder_string";
220 CostEstimate
getCost(std::vector<int> inputDims) {
// Builds a CostEstimate from a literal 0 and the first two entries of
// inputDims.
// NOTE(review): inputDims is taken by value and read at [0] and [1] without a
// size check; presumably callers always pass at least two dimensions - confirm.
222 return CostEstimate(0, inputDims[0], inputDims[1]);
226 std::unordered_map<std::string, int> mapping_;
240 mapping_ = std::unordered_map<std::string, int>(mapping);
252 const SelectivityVector& rows,
253 std::vector<VectorPtr>& args,
255 exec::EvalCtx& context,
256 VectorPtr& output)
const override {
// Fragment of an apply() override taken from a line-numbered listing: the
// opening of the signature, the parameter line that declares `type`, and
// several body lines are absent from this excerpt.
//
// Visible behaviour: for every selected row, walk that row's slice of the
// input array's string elements, look each string up in mapping_, and collect
// one int per element into the row's output array.

// Request a writable `output` of the requested `type` for `rows`.
257 BaseVector::ensureWritable(rows, type, context.pool(), output);
// args[0] is treated as an ArrayVector whose elements are flat StringViews.
259 auto arrayVector = args[0]->as<ArrayVector>();
260 auto elementsVector = arrayVector->elements()->asFlatVector<StringView>();
// `result` is sized by rows.size() and indexed by row number below.
261 auto numRows = rows.size();
263 std::vector<std::vector<int>> result(numRows);
265 rows.applyToSelected([&](vector_size_t row) {
// This row's elements occupy [offset, offset + numElements) in elementsVector.
266 int numElements = arrayVector->sizeAt(row);
267 int offset = arrayVector->offsetAt(row);
269 std::vector<int> indices;
270 indices.reserve(numElements);
272 for (
int j = 0; j < numElements; ++j) {
274 StringView val = elementsVector->valueAt(offset + j);
276 auto it = mapping_.find(val.getString());
277 if (it != mapping_.end()) {
278 indices.push_back(it->second);
// Presumably the miss branch (its `else` line is not in this excerpt): a 0 is
// appended so the output keeps one entry per input element.
// NOTE(review): 0 cannot be told apart from a key that really maps to 0, and
// the message goes to std::cout tagged "[ERROR]" where the other encoders'
// apply() bodies in this file use LOG(WARNING) - confirm both are intended.
281 indices.push_back(0);
282 std::cout <<
"[ERROR] Missing key: " << val.getString() << std::endl;
// NOTE(review): `indices` is copied into result[row]; no later use of it is
// visible here, so a move looks possible - confirm against the full source.
285 result[row] = indices;
// `output` is reassigned to a vector built from `result`.
// NOTE(review): this replaces whatever ensureWritable prepared above - confirm
// that dropping the prepared vector is intended.
288 VectorMaker maker{context.pool()};
289 output = maker.arrayVector<
int>(result, INTEGER());
296 static std::vector<std::shared_ptr<exec::FunctionSignature>>
signatures() {
// Starts a FunctionSignatureBuilder chain with argument type "array(VARCHAR)"
// and return type "array(INTEGER)". The rest of the expression is not part of
// this excerpt.
297 return {exec::FunctionSignatureBuilder()
298 .argumentType(
"array(VARCHAR)")
299 .returnType(
"array(INTEGER)")
308 return "encoder_string_variadic";
325 CostEstimate
getCost(std::vector<int> inputDims) {
// Builds a CostEstimate from a literal 0 and the first two entries of
// inputDims.
// NOTE(review): inputDims is taken by value and read at [0] and [1] without a
// size check; presumably callers always pass at least two dimensions - confirm.
327 return CostEstimate(0, inputDims[0], inputDims[1]);
331 std::unordered_map<std::string, int> mapping_;
357 const SelectivityVector& rows,
358 std::vector<VectorPtr>& args,
360 exec::EvalCtx& context,
361 VectorPtr& output)
const override {
// Fragment of an apply() override taken from a line-numbered listing: the
// opening of the signature, the parameter line that declares `type`, and the
// loops' closing lines are absent from this excerpt.
//
// Visible behaviour: for each row i, build a float vector of length size_
// initialised to 0 and write 1 / (number of indices in row i) at every
// position named by that row's input indices.

// Request a writable `output` of the requested `type` for `rows`.
362 BaseVector::ensureWritable(rows, type, context.pool(), output);
364 auto indicesRowVector = args[0];
365 auto arrayVector = indicesRowVector->as<ArrayVector>();
// The element values are accessed through a raw int pointer.
// NOTE(review): asMutable is used although the visible code only reads
// through the pointer - a const accessor may be enough; confirm.
367 auto indicesVector = arrayVector->elements();
368 int* indicesValues = indicesVector->values()->asMutable<
int>();
369 int numInputs = rows.size();
// One zero-filled vector of length size_ per row.
371 std::vector<std::vector<float>> encoding(
372 numInputs, std::vector<float>(size_, 0));
// NOTE(review): this loop visits every i in [0, rows.size()) and does not
// consult the selection in `rows`, unlike the other encoders' apply() bodies
// in this file, which use rows.applyToSelected - confirm unselected rows are
// safe to read here.
374 for (
int i = 0; i < numInputs; i++) {
375 int numSubIndices = arrayVector->sizeAt(i);
376 int indicesOffset = arrayVector->offsetAt(i);
// For an empty array this evaluates 1.0 / 0; the result goes unused because
// the inner loop body then never runs.
377 float value = 1.0 / numSubIndices;
378 for (
int j = 0; j < numSubIndices; j++) {
// NOTE(review): embedIndex is not checked against [0, size_); a negative or
// too-large index writes outside encoding[i]. A repeated index is written
// again with the same value, so the row then sums to less than 1 - confirm.
379 int embedIndex = indicesValues[indicesOffset + j];
380 encoding[i][embedIndex] = value;
// `output` is reassigned to a REAL array vector built from `encoding`.
// NOTE(review): this replaces whatever ensureWritable prepared above - confirm
// that dropping the prepared vector is intended.
384 VectorMaker maker{context.pool()};
385 output = maker.arrayVector<
float>(encoding, REAL());
392 static std::vector<std::shared_ptr<exec::FunctionSignature>>
signatures() {
// Starts a FunctionSignatureBuilder chain with argument type "array(INTEGER)"
// and return type "array(REAL)". The rest of the expression is not part of
// this excerpt.
393 return {exec::FunctionSignatureBuilder()
394 .argumentType(
"array(INTEGER)")
395 .returnType(
"array(REAL)")
404 return "multi_hot_norm_encoder";
421 CostEstimate
getCost(std::vector<int> inputDims) {
// Builds a CostEstimate from a literal 0 and the first two entries of
// inputDims.
// NOTE(review): inputDims is taken by value and read at [0] and [1] without a
// size check; presumably callers always pass at least two dimensions - confirm.
423 return CostEstimate(0, inputDims[0], inputDims[1]);
static std::vector< std::shared_ptr< exec::FunctionSignature > > signatures()
Returns the function signatures.
Definition Encoder.h:95
CostEstimate getCost(std::vector< int > inputDims)
Estimates the cost of the function.
Definition Encoder.h:124
void apply(const SelectivityVector &rows, std::vector< VectorPtr > &args, const TypePtr &type, exec::EvalCtx &context, VectorPtr &output) const override
Applies the encoding function to the input data.
Definition Encoder.h:53
IntEncoder(std::unordered_map< int, int > mapping)
Constructor for IntEncoder.
Definition Encoder.h:41
float * getTensor() const override
Returns the tensor associated with the function.
Definition Encoder.h:114
static std::string getName()
Returns the name of the function.
Definition Encoder.h:106
A base class for machine learning functions, inheriting from Velox's VectorFunction.
Definition BaseFunction.h:9
MultiHotNormalizedEncoder(int size)
Constructor for MultiHotNormalizedEncoder.
Definition Encoder.h:344
float * getTensor() const override
Returns the tensor associated with the function.
Definition Encoder.h:411
CostEstimate getCost(std::vector< int > inputDims)
Estimates the cost of the function.
Definition Encoder.h:421
static std::string getName()
Returns the name of the function.
Definition Encoder.h:403
void apply(const SelectivityVector &rows, std::vector< VectorPtr > &args, const TypePtr &type, exec::EvalCtx &context, VectorPtr &output) const override
Applies the encoding function to the input data.
Definition Encoder.h:356
static std::vector< std::shared_ptr< exec::FunctionSignature > > signatures()
Returns the function signatures.
Definition Encoder.h:392
float * getTensor() const override
Returns the tensor associated with the function.
Definition Encoder.h:210
static std::vector< std::shared_ptr< exec::FunctionSignature > > signatures()
Returns the function signatures.
Definition Encoder.h:191
void apply(const SelectivityVector &rows, std::vector< VectorPtr > &args, const TypePtr &type, exec::EvalCtx &context, VectorPtr &output) const override
Applies the encoding function to the input data.
Definition Encoder.h:155
CostEstimate getCost(std::vector< int > inputDims)
Estimates the cost of the function.
Definition Encoder.h:220
StringEncoder(std::unordered_map< std::string, int > mapping)
Constructor for StringEncoder.
Definition Encoder.h:143
static std::string getName()
Returns the name of the function.
Definition Encoder.h:202
StringVariadicEncoder(std::unordered_map< std::string, int > mapping)
Constructor for StringVariadicEncoder.
Definition Encoder.h:239
CostEstimate getCost(std::vector< int > inputDims)
Estimates the cost of the function.
Definition Encoder.h:325
void apply(const SelectivityVector &rows, std::vector< VectorPtr > &args, const TypePtr &type, exec::EvalCtx &context, VectorPtr &output) const override
Applies the encoding function to the input data.
Definition Encoder.h:251
static std::vector< std::shared_ptr< exec::FunctionSignature > > signatures()
Returns the function signatures.
Definition Encoder.h:296
float * getTensor() const override
Returns the tensor associated with the function.
Definition Encoder.h:315
static std::string getName()
Returns the name of the function.
Definition Encoder.h:307