ML functions
 
Loading...
Searching...
No Matches
Encoder.h
Go to the documentation of this file.
1
15
16#pragma once
17
#include <fmt/format.h>
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "BaseFunction.h"
#include "velox/exec/tests/utils/AssertQueryBuilder.h"
#include "velox/exec/tests/utils/PlanBuilder.h"
#include "velox/exec/tests/utils/TempDirectoryPath.h"
#include "velox/vector/tests/utils/VectorTestBase.h"
25
26using namespace facebook::velox;
27using namespace facebook::velox::test;
28using namespace facebook::velox::exec::test;
29using namespace facebook::velox::memory;
30
35class IntEncoder : public MLFunction {
36 public:
41 IntEncoder(std::unordered_map<int, int> mapping) {
42 mapping_ = mapping;
43 }
44
53 void apply(
54 const SelectivityVector& rows,
55 std::vector<VectorPtr>& args,
56 const TypePtr& type,
57 exec::EvalCtx& context,
58 VectorPtr& output) const override {
59 BaseVector::ensureWritable(rows, ARRAY(INTEGER()), context.pool(), output);
60
61 // Decode the input argument.
62 auto arrayVector = args[0]->as<ArrayVector>();
63 auto elementsVector = arrayVector->elements()->asFlatVector<int>();
64
65 // Map to store result rows.
66 auto numInputs = rows.size();
67 std::vector<std::vector<int>> result(numInputs);
68
69 // Process only the selected rows.
70 rows.applyToSelected([&](int row) {
71 // Decode the array element for this row.
72 auto userIdBeforeEncode = elementsVector->valueAt(row);
73
74 // Check if the userId exists in the mapping.
75 auto it = mapping_.find(userIdBeforeEncode);
76 if (it != mapping_.end()) {
77 // If found, set the result.
78 result[row] = {it->second};
79 } else {
80 // Handle missing keys if necessary.
81 result[row] = {-1};
82 LOG(WARNING) << "[WARNING] Missing key: " << userIdBeforeEncode
83 << " mapping size: " << mapping_.size() << std::endl;
84 }
85 });
86
87 VectorMaker maker{context.pool()};
88 output = maker.arrayVector<int>(result, INTEGER());
89 }
90
95 static std::vector<std::shared_ptr<exec::FunctionSignature>> signatures() {
96 return {exec::FunctionSignatureBuilder()
97 .argumentType("array(INTEGER)")
98 .returnType("array(INTEGER)")
99 .build()};
100 }
101
106 static std::string getName() {
107 return "encoder";
108 };
109
114 float* getTensor() const override {
115 // TODO: need to implement
116 return nullptr;
117 }
118
124 CostEstimate getCost(std::vector<int> inputDims) {
125 // TODO: need to implement
126 return CostEstimate(0, inputDims[0], inputDims[1]);
127 }
128
129 private:
130 std::unordered_map<int, int> mapping_;
131};
132
137class StringEncoder : public MLFunction {
138 public:
143 StringEncoder(std::unordered_map<std::string, int> mapping) {
144 mapping_ = mapping;
145 }
146
155 void apply(
156 const SelectivityVector& rows,
157 std::vector<VectorPtr>& args,
158 const TypePtr& type,
159 exec::EvalCtx& context,
160 VectorPtr& output) const override {
161 BaseVector::ensureWritable(rows, type, context.pool(), output);
162
163 // Read string input
164 exec::LocalDecodedVector decodedStringHolder(context, *args[0], rows);
165 auto decodedStringInput = decodedStringHolder.get();
166 int numInputs = rows.size();
167
168 std::vector<std::vector<int>> result(numInputs);
169
170 rows.applyToSelected([&](int row) {
171 StringView val = decodedStringInput->valueAt<StringView>(row);
172 auto it = mapping_.find(val.getString());
173 if (it != mapping_.end()) {
174 result[row] = {it->second};
175 } else {
176 // Handle missing keys if necessary
177 result[row] = {-1};
178 LOG(WARNING) << "[WARNING] Missing key: " << val.getString()
179 << std::endl;
180 }
181 });
182
183 VectorMaker maker{context.pool()};
184 output = maker.arrayVector<int>(result, INTEGER());
185 }
186
191 static std::vector<std::shared_ptr<exec::FunctionSignature>> signatures() {
192 return {exec::FunctionSignatureBuilder()
193 .argumentType("VARCHAR")
194 .returnType("array(INTEGER)")
195 .build()};
196 }
197
202 static std::string getName() {
203 return "encoder_string";
204 };
205
210 float* getTensor() const override {
211 // TODO: need to implement
212 return nullptr;
213 }
214
220 CostEstimate getCost(std::vector<int> inputDims) {
221 // TODO: need to implement
222 return CostEstimate(0, inputDims[0], inputDims[1]);
223 }
224
225 private:
226 std::unordered_map<std::string, int> mapping_;
227};
228
234 public:
239 StringVariadicEncoder(std::unordered_map<std::string, int> mapping) {
240 mapping_ = std::unordered_map<std::string, int>(mapping);
241 }
242
251 void apply(
252 const SelectivityVector& rows,
253 std::vector<VectorPtr>& args,
254 const TypePtr& type,
255 exec::EvalCtx& context,
256 VectorPtr& output) const override {
257 BaseVector::ensureWritable(rows, type, context.pool(), output);
258
259 auto arrayVector = args[0]->as<ArrayVector>();
260 auto elementsVector = arrayVector->elements()->asFlatVector<StringView>();
261 auto numRows = rows.size();
262
263 std::vector<std::vector<int>> result(numRows);
264
265 rows.applyToSelected([&](vector_size_t row) {
266 int numElements = arrayVector->sizeAt(row);
267 int offset = arrayVector->offsetAt(row);
268
269 std::vector<int> indices;
270 indices.reserve(numElements);
271
272 for (int j = 0; j < numElements; ++j) {
273 // Safely decode each string
274 StringView val = elementsVector->valueAt(offset + j);
275
276 auto it = mapping_.find(val.getString());
277 if (it != mapping_.end()) {
278 indices.push_back(it->second);
279 } else {
280 // Handle missing keys if necessary
281 indices.push_back(0); // Or some default value
282 std::cout << "[ERROR] Missing key: " << val.getString() << std::endl;
283 }
284 }
285 result[row] = indices;
286 });
287
288 VectorMaker maker{context.pool()};
289 output = maker.arrayVector<int>(result, INTEGER());
290 }
291
296 static std::vector<std::shared_ptr<exec::FunctionSignature>> signatures() {
297 return {exec::FunctionSignatureBuilder()
298 .argumentType("array(VARCHAR)")
299 .returnType("array(INTEGER)")
300 .build()};
301 }
302
/**
 * @brief Returns the name of the function.
 * @return The string "encoder_string_variadic".
 */
static std::string getName() {
  return std::string{"encoder_string_variadic"};
}
310
315 float* getTensor() const override {
316 // TODO: need to implement
317 return nullptr;
318 }
319
325 CostEstimate getCost(std::vector<int> inputDims) {
326 // TODO: need to implement
327 return CostEstimate(0, inputDims[0], inputDims[1]);
328 }
329
330 private:
331 std::unordered_map<std::string, int> mapping_;
332};
333
339 public:
345 size_ = size;
346 }
347
356 void apply(
357 const SelectivityVector& rows,
358 std::vector<VectorPtr>& args,
359 const TypePtr& type,
360 exec::EvalCtx& context,
361 VectorPtr& output) const override {
362 BaseVector::ensureWritable(rows, type, context.pool(), output);
363
364 auto indicesRowVector = args[0];
365 auto arrayVector = indicesRowVector->as<ArrayVector>();
366
367 auto indicesVector = arrayVector->elements();
368 int* indicesValues = indicesVector->values()->asMutable<int>();
369 int numInputs = rows.size();
370
371 std::vector<std::vector<float>> encoding(
372 numInputs, std::vector<float>(size_, 0));
373
374 for (int i = 0; i < numInputs; i++) {
375 int numSubIndices = arrayVector->sizeAt(i);
376 int indicesOffset = arrayVector->offsetAt(i);
377 float value = 1.0 / numSubIndices;
378 for (int j = 0; j < numSubIndices; j++) {
379 int embedIndex = indicesValues[indicesOffset + j];
380 encoding[i][embedIndex] = value;
381 }
382 }
383
384 VectorMaker maker{context.pool()};
385 output = maker.arrayVector<float>(encoding, REAL());
386 }
387
392 static std::vector<std::shared_ptr<exec::FunctionSignature>> signatures() {
393 return {exec::FunctionSignatureBuilder()
394 .argumentType("array(INTEGER)")
395 .returnType("array(REAL)")
396 .build()};
397 }
398
/**
 * @brief Returns the name of the function.
 * @return The string "multi_hot_norm_encoder".
 */
static std::string getName() {
  return std::string{"multi_hot_norm_encoder"};
}
406
411 float* getTensor() const override {
412 // TODO: need to implement
413 return nullptr;
414 }
415
421 CostEstimate getCost(std::vector<int> inputDims) {
422 // TODO: need to implement
423 return CostEstimate(0, inputDims[0], inputDims[1]);
424 }
425
426 private:
427 int size_;
428};
static std::vector< std::shared_ptr< exec::FunctionSignature > > signatures()
Returns the function signatures.
Definition Encoder.h:95
CostEstimate getCost(std::vector< int > inputDims)
Estimates the cost of the function.
Definition Encoder.h:124
void apply(const SelectivityVector &rows, std::vector< VectorPtr > &args, const TypePtr &type, exec::EvalCtx &context, VectorPtr &output) const override
Applies the encoding function to the input data.
Definition Encoder.h:53
IntEncoder(std::unordered_map< int, int > mapping)
Constructor for IntEncoder.
Definition Encoder.h:41
float * getTensor() const override
Returns the tensor associated with the function.
Definition Encoder.h:114
static std::string getName()
Returns the name of the function.
Definition Encoder.h:106
A base class for machine learning functions, inheriting from Velox's VectorFunction.
Definition BaseFunction.h:9
MultiHotNormalizedEncoder(int size)
Constructor for MultiHotNormalizedEncoder.
Definition Encoder.h:344
float * getTensor() const override
Returns the tensor associated with the function.
Definition Encoder.h:411
CostEstimate getCost(std::vector< int > inputDims)
Estimates the cost of the function.
Definition Encoder.h:421
static std::string getName()
Returns the name of the function.
Definition Encoder.h:403
void apply(const SelectivityVector &rows, std::vector< VectorPtr > &args, const TypePtr &type, exec::EvalCtx &context, VectorPtr &output) const override
Applies the encoding function to the input data.
Definition Encoder.h:356
static std::vector< std::shared_ptr< exec::FunctionSignature > > signatures()
Returns the function signatures.
Definition Encoder.h:392
float * getTensor() const override
Returns the tensor associated with the function.
Definition Encoder.h:210
static std::vector< std::shared_ptr< exec::FunctionSignature > > signatures()
Returns the function signatures.
Definition Encoder.h:191
void apply(const SelectivityVector &rows, std::vector< VectorPtr > &args, const TypePtr &type, exec::EvalCtx &context, VectorPtr &output) const override
Applies the encoding function to the input data.
Definition Encoder.h:155
CostEstimate getCost(std::vector< int > inputDims)
Estimates the cost of the function.
Definition Encoder.h:220
StringEncoder(std::unordered_map< std::string, int > mapping)
Constructor for StringEncoder.
Definition Encoder.h:143
static std::string getName()
Returns the name of the function.
Definition Encoder.h:202
StringVariadicEncoder(std::unordered_map< std::string, int > mapping)
Constructor for StringVariadicEncoder.
Definition Encoder.h:239
CostEstimate getCost(std::vector< int > inputDims)
Estimates the cost of the function.
Definition Encoder.h:325
void apply(const SelectivityVector &rows, std::vector< VectorPtr > &args, const TypePtr &type, exec::EvalCtx &context, VectorPtr &output) const override
Applies the encoding function to the input data.
Definition Encoder.h:251
static std::vector< std::shared_ptr< exec::FunctionSignature > > signatures()
Returns the function signatures.
Definition Encoder.h:296
float * getTensor() const override
Returns the tensor associated with the function.
Definition Encoder.h:315
static std::string getName()
Returns the name of the function.
Definition Encoder.h:307