18#include <fmt/format.h>
19#include <tokenizers_cpp.h>
21#include "BaseFunction.h"
22#include "velox/exec/tests/utils/AssertQueryBuilder.h"
23#include "velox/exec/tests/utils/PlanBuilder.h"
24#include "velox/exec/tests/utils/TempDirectoryPath.h"
25#include "velox/ml_functions/UtilFunction.h"
26#include "velox/vector/tests/utils/VectorTestBase.h"
28using tokenizers::Tokenizer;
29using namespace facebook::velox;
30using namespace facebook::velox::test;
31using namespace facebook::velox::exec::test;
32using namespace facebook::velox::memory;
45 pathToTokenizer_ = pathToTokenizer;
47 tokenizer_ = Tokenizer::FromBlobJSON(blob);
59 const SelectivityVector& rows,
60 std::vector<VectorPtr>& args,
62 exec::EvalCtx& context,
63 VectorPtr& output)
const override {
64 BaseVector::ensureWritable(rows, type, context.pool(), output);
67 exec::LocalDecodedVector decodedStringHolder(context, *args[0], rows);
68 auto decodedStringInput = decodedStringHolder.get();
69 int numInputs = rows.size();
71 std::vector<std::vector<int>> result;
72 for (
int i = 0; i < numInputs; i++) {
73 StringView val = decodedStringInput->valueAt<StringView>(i);
74 std::vector<int> ids = tokenizer_->Encode(val);
76 result.push_back(ids);
78 VectorMaker maker{context.pool()};
79 output = maker.arrayVector<
int>(result, INTEGER());
86 static std::vector<std::shared_ptr<exec::FunctionSignature>>
signatures() {
87 return {exec::FunctionSignatureBuilder()
88 .argumentType(
"VARCHAR")
89 .returnType(
"array(INTEGER)")
98 return "hf_tokenizer";
115 CostEstimate
getCost(std::vector<int> inputDims) {
117 return CostEstimate(0, inputDims[0], inputDims[1]);
121 std::string pathToTokenizer_;
122 std::unique_ptr<Tokenizer> tokenizer_;
std::string LoadBytesFromFile(const std::string &path)
Loads bytes from a file into a string.
Definition UtilFunction.h:452
CostEstimate getCost(std::vector< int > inputDims)
Estimates the cost of the function.
Definition HuggingFaceTokenizer.h:115
HuggingFaceTokenizer(std::string pathToTokenizer)
Constructor for HuggingFaceTokenizer.
Definition HuggingFaceTokenizer.h:44
void apply(const SelectivityVector &rows, std::vector< VectorPtr > &args, const TypePtr &type, exec::EvalCtx &context, VectorPtr &output) const override
Applies the tokenizer function to the input data.
Definition HuggingFaceTokenizer.h:58
static std::vector< std::shared_ptr< exec::FunctionSignature > > signatures()
Returns the function signatures.
Definition HuggingFaceTokenizer.h:86
float * getTensor() const override
Returns the tensor associated with the function.
Definition HuggingFaceTokenizer.h:105
static std::string getName()
Returns the name of the function.
Definition HuggingFaceTokenizer.h:97
A base class for machine learning functions, inheriting from Velox's VectorFunction.
Definition BaseFunction.h:9