ML functions
 
Loading...
Searching...
No Matches
HuggingFaceTokenizer.h
Go to the documentation of this file.
1
15
16#pragma once
17
18#include <fmt/format.h>
19#include <tokenizers_cpp.h>
20#include <iostream>
21#include "BaseFunction.h"
22#include "velox/exec/tests/utils/AssertQueryBuilder.h"
23#include "velox/exec/tests/utils/PlanBuilder.h"
24#include "velox/exec/tests/utils/TempDirectoryPath.h"
25#include "velox/ml_functions/UtilFunction.h"
26#include "velox/vector/tests/utils/VectorTestBase.h"
27
28using tokenizers::Tokenizer;
29using namespace facebook::velox;
30using namespace facebook::velox::test;
31using namespace facebook::velox::exec::test;
32using namespace facebook::velox::memory;
33
39 public:
44 HuggingFaceTokenizer(std::string pathToTokenizer) {
45 pathToTokenizer_ = pathToTokenizer;
46 auto blob = LoadBytesFromFile(pathToTokenizer_);
47 tokenizer_ = Tokenizer::FromBlobJSON(blob);
48 }
49
58 void apply(
59 const SelectivityVector& rows,
60 std::vector<VectorPtr>& args,
61 const TypePtr& type,
62 exec::EvalCtx& context,
63 VectorPtr& output) const override {
64 BaseVector::ensureWritable(rows, type, context.pool(), output);
65
66 // Read string input
67 exec::LocalDecodedVector decodedStringHolder(context, *args[0], rows);
68 auto decodedStringInput = decodedStringHolder.get();
69 int numInputs = rows.size();
70
71 std::vector<std::vector<int>> result;
72 for (int i = 0; i < numInputs; i++) {
73 StringView val = decodedStringInput->valueAt<StringView>(i);
74 std::vector<int> ids = tokenizer_->Encode(val);
75
76 result.push_back(ids);
77 }
78 VectorMaker maker{context.pool()};
79 output = maker.arrayVector<int>(result, INTEGER());
80 }
81
86 static std::vector<std::shared_ptr<exec::FunctionSignature>> signatures() {
87 return {exec::FunctionSignatureBuilder()
88 .argumentType("VARCHAR")
89 .returnType("array(INTEGER)")
90 .build()};
91 }
92
/// Returns the name of the function, "hf_tokenizer".
static std::string getName() {
  return std::string{"hf_tokenizer"};
}
100
105 float* getTensor() const override {
106 // TODO: need to implement
107 return nullptr;
108 }
109
115 CostEstimate getCost(std::vector<int> inputDims) {
116 // TODO: need to implement
117 return CostEstimate(0, inputDims[0], inputDims[1]);
118 }
119
120 private:
121 std::string pathToTokenizer_;
122 std::unique_ptr<Tokenizer> tokenizer_;
123};
std::string LoadBytesFromFile(const std::string &path)
Loads bytes from a file into a string.
Definition UtilFunction.h:452
CostEstimate getCost(std::vector< int > inputDims)
Estimates the cost of the function.
Definition HuggingFaceTokenizer.h:115
HuggingFaceTokenizer(std::string pathToTokenizer)
Constructor for HuggingFaceTokenizer.
Definition HuggingFaceTokenizer.h:44
void apply(const SelectivityVector &rows, std::vector< VectorPtr > &args, const TypePtr &type, exec::EvalCtx &context, VectorPtr &output) const override
Applies the tokenizer function to the input data.
Definition HuggingFaceTokenizer.h:58
static std::vector< std::shared_ptr< exec::FunctionSignature > > signatures()
Returns the function signatures.
Definition HuggingFaceTokenizer.h:86
float * getTensor() const override
Returns the tensor associated with the function.
Definition HuggingFaceTokenizer.h:105
static std::string getName()
Returns the name of the function.
Definition HuggingFaceTokenizer.h:97
A base class for machine learning functions, inheriting from Velox's VectorFunction.
Definition BaseFunction.h:9