Showing 2 changed files with 159 additions and 0 deletions.
@@ -0,0 +1,106 @@
defmodule Scholar.FeatureExtraction.CountVectorizer do
  @moduledoc """
  A `CountVectorizer` converts an already indexed collection of text documents to a matrix of token counts.
  """
  import Nx.Defn

  opts_schema = [
    max_token_id: [
      type: :pos_integer,
      required: true,
      doc: ~S"""
      Maximum token id in the input tensor.
      """
    ]
  ]

  @opts_schema NimbleOptions.new!(opts_schema)

@doc """ | ||
Generates a count matrix where each row corresponds to a document in the input corpus, | ||
and each column corresponds to a unique token in the vocabulary of the corpus. | ||
The input must be a 2D tensor where: | ||
* Each row represents a document. | ||
* Each document has integer values representing tokens. | ||
The same number represents the same token in the vocabulary. Tokens should start from 0 | ||
and be consecutive. Negative values are ignored, making them suitable for padding. | ||
## Options | ||
#{NimbleOptions.docs(@opts_schema)} | ||
## Examples | ||
iex> t = Nx.tensor([[0, 1, 2], [1, 3, 4]]) | ||
iex> Scholar.FeatureExtraction.CountVectorizer.fit_transform(t, max_token_id: Scholar.FeatureExtraction.CountVectorizer.max_token_id(t)) | ||
Nx.tensor([ | ||
[1, 1, 1, 0, 0], | ||
[0, 1, 0, 1, 1] | ||
]) | ||
With padding: | ||
iex> t = Nx.tensor([[0, 1, -1], [1, 3, 4]]) | ||
iex> Scholar.FeatureExtraction.CountVectorizer.fit_transform(t, max_token_id: Scholar.FeatureExtraction.CountVectorizer.max_token_id(t)) | ||
Nx.tensor([ | ||
[1, 1, 0, 0, 0], | ||
[0, 1, 0, 1, 1] | ||
]) | ||
""" | ||
deftransform fit_transform(tensor, opts \\ []) do | ||
fit_transform_n(tensor, NimbleOptions.validate!(opts, @opts_schema)) | ||
end | ||
|
||
@doc """ | ||
Computes the max_token_id option from given tensor. | ||
This function cannot be called inside `defn` (and it will raise | ||
if you try to do so). | ||
## Examples | ||
iex> t = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]]) | ||
iex> Scholar.FeatureExtraction.CountVectorizer.max_token_id(t) | ||
2 | ||
""" | ||
def max_token_id(tensor) do | ||
tensor |> Nx.reduce_max() |> Nx.to_number() | ||
end | ||
|
||
  defnp fit_transform_n(tensor, opts) do
    check_for_rank(tensor)
    counts = Nx.broadcast(0, {Nx.axis_size(tensor, 0), opts[:max_token_id] + 1})

    # Walk every {document, position} pair and increment the count of the token
    # found there; negative values are treated as padding and skipped.
    {_, counts} =
      while {{i = 0, tensor}, counts}, Nx.less(i, Nx.axis_size(tensor, 0)) do
        {_, counts} =
          while {{j = 0, i, tensor}, counts}, Nx.less(j, Nx.axis_size(tensor, 1)) do
            index = tensor[i][j]

            counts =
              if Nx.any(Nx.less(index, 0)),
                do: counts,
                else: Nx.indexed_add(counts, Nx.stack([i, index]), 1)

            {{j + 1, i, tensor}, counts}
          end

        {{i + 1, tensor}, counts}
      end

    counts
  end

  defnp check_for_rank(tensor) do
    if Nx.rank(tensor) != 2 do
      raise ArgumentError,
            """
            expected tensor to have shape {num_documents, num_tokens}, \
            got tensor with shape: #{inspect(Nx.shape(tensor))}\
            """
    end
  end
end
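For context, here is a minimal end-to-end usage sketch of the module above. The corpus, the word-to-id mapping, and the variable names are illustrative assumptions and not part of this commit: the corpus is first indexed into consecutive integer ids starting at 0, then passed to fit_transform/2.

# Hypothetical indexing step: map each distinct word to a consecutive id starting at 0.
corpus = [~w(i like programming), ~w(i like elixir)]

vocab =
  corpus
  |> List.flatten()
  |> Enum.uniq()
  |> Enum.with_index()
  |> Map.new()

# Both documents already have the same length, so no padding is needed here.
t = Nx.tensor(Enum.map(corpus, fn doc -> Enum.map(doc, &Map.fetch!(vocab, &1)) end))

Scholar.FeatureExtraction.CountVectorizer.fit_transform(t,
  max_token_id: Scholar.FeatureExtraction.CountVectorizer.max_token_id(t)
)
# => a {2, 4} tensor of per-document token counts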
@@ -0,0 +1,53 @@
defmodule Scholar.FeatureExtraction.CountVectorizerTest do
  use Scholar.Case, async: true
  alias Scholar.FeatureExtraction.CountVectorizer
  doctest CountVectorizer

describe "fit_transform" do | ||
test "fit_transform test" do | ||
tesnsor = Nx.tensor([[2, 3, 0], [1, 4, 4]]) | ||
|
||
counts = | ||
CountVectorizer.fit_transform(tesnsor, | ||
max_token_id: CountVectorizer.max_token_id(tesnsor) | ||
) | ||
|
||
expected_counts = Nx.tensor([[1, 0, 1, 1, 0], [0, 1, 0, 0, 2]]) | ||
|
||
assert counts == expected_counts | ||
end | ||
|
||
test "fit_transform test - tensor with padding" do | ||
tensor = Nx.tensor([[2, 3, 0], [1, 4, -1]]) | ||
|
||
counts = | ||
CountVectorizer.fit_transform(tensor, max_token_id: CountVectorizer.max_token_id(tensor)) | ||
|
||
expected_counts = Nx.tensor([[1, 0, 1, 1, 0], [0, 1, 0, 0, 1]]) | ||
|
||
assert counts == expected_counts | ||
end | ||
end | ||
|
||
describe "max_token_id" do | ||
test "max_token_id test" do | ||
tensor = Nx.tensor([[2, 3, 0], [1, 4, 4]]) | ||
assert CountVectorizer.max_token_id(tensor) == 4 | ||
end | ||
|
||
test "max_token_id tes - tensor with padding" do | ||
tensor = Nx.tensor([[2, 3, 0], [1, 4, -1]]) | ||
assert CountVectorizer.max_token_id(tensor) == 4 | ||
end | ||
end | ||
|
||
describe "errors" do | ||
test "wrong input rank" do | ||
assert_raise ArgumentError, | ||
"expected tensor to have shape {num_documents, num_tokens}, got tensor with shape: {3}", | ||
fn -> | ||
CountVectorizer.fit_transform(Nx.tensor([1, 2, 3]), max_token_id: 3) | ||
end | ||
end | ||
end | ||
end |
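Since negative values are ignored by fit_transform/2, ragged documents can be padded with -1 to a common width before being stacked into one tensor. A minimal sketch of that pre-processing step (the list literals and variable names are illustrative, not part of this commit); it reproduces the input of the padding test above:

docs = [[2, 3, 0], [1, 4]]
width = docs |> Enum.map(&length/1) |> Enum.max()

# Pad shorter documents with -1 so the counter skips those positions.
padded = Enum.map(docs, fn doc -> doc ++ List.duplicate(-1, width - length(doc)) end)
t = Nx.tensor(padded)

Scholar.FeatureExtraction.CountVectorizer.fit_transform(t,
  max_token_id: Scholar.FeatureExtraction.CountVectorizer.max_token_id(t)
)
# => Nx.tensor([[1, 0, 1, 1, 0], [0, 1, 0, 0, 1]]), as asserted in the padding test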