From 844328059ef44baba689f5335219dd69b22bb0df Mon Sep 17 00:00:00 2001
From: Sagar Dollin
Date: Sun, 29 Aug 2021 01:42:01 +0530
Subject: [PATCH] Updated RandomState (deprecated from numpy) to default_rng (Generator)

This is regarding issue #2782. Here are benchmarks from before and after the update:

                  Before update               After update
  Poincare        Ran 42 tests in 0.418s      Ran 42 tests in 0.417s
  test_lda        Ran 48 tests in 223.845s    Ran 48 tests in 225.561s
  utils           Ran 24 tests in 0.007s      Ran 24 tests in 0.007s
  test_matutils   Ran 18 tests in 0.071s      Ran 18 tests in 0.070s
  word2vec        Ran 79 tests in 58.149s     Ran 79 tests in 57.950s

I don't see a big difference in the time taken; however, I feel it is good to stay up to date with numpy.
---
 gensim/models/ldamodel.py        |  2 +-
 gensim/models/poincare.py        |  7 +++++--
 gensim/models/test_poincare.py   | 14 ++++++++++++++
 gensim/models/word2vec.py        |  2 +-
 gensim/models/word2vec_inner.pyx | 21 ++++++++++++++++-----
 gensim/test/test_ldamodel.py     |  8 ++++----
 gensim/test/test_matutils.py     |  2 +-
 gensim/utils.py                  |  6 +++---
 8 files changed, 45 insertions(+), 17 deletions(-)
 create mode 100644 gensim/models/test_poincare.py

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 6691ddcc31..ad58efec3c 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -1174,7 +1174,7 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):
         num_topics = min(num_topics, self.num_topics)
 
         # add a little random jitter, to randomize results around the same alpha
-        sort_alpha = self.alpha + 0.0001 * self.random_state.rand(len(self.alpha))
+        sort_alpha = self.alpha + 0.0001 * self.random_state.random(len(self.alpha))
         # random_state.rand returns float64, but converting back to dtype won't speed up anything
 
         sorted_topics = list(matutils.argsort(sort_alpha))
diff --git a/gensim/models/poincare.py b/gensim/models/poincare.py
index 136fd6b6d5..a37f35e91a 100644
--- a/gensim/models/poincare.py
+++ b/gensim/models/poincare.py
@@ -164,7 +164,7 @@ def __init__(self, train_data, size=50, alpha=0.1, negative=10, workers=1, epsil
         self._burn_in_done = False
         self.dtype = dtype
         self.seed = seed
-        self._np_random = np_random.RandomState(seed)
+        self._np_random = np_random.default_rng(seed)
         self.init_range = init_range
         self._loss_grad = None
         self.build_vocab(train_data)
@@ -264,7 +264,10 @@ def _get_candidate_negatives(self):
         # this is to avoid floating point errors that result when the number of nodes is very high
         # for reference: https://github.com/RaRe-Technologies/gensim/issues/1917
         max_cumsum_value = self._node_counts_cumsum[-1]
-        uniform_numbers = self._np_random.randint(1, max_cumsum_value + 1, self._negatives_buffer_size)
+        if isinstance(self._np_random, np.random.Generator):
+            uniform_numbers = self._np_random.integers(1, max_cumsum_value + 1, self._negatives_buffer_size)
+        else:
+            uniform_numbers = self._np_random.randint(1, max_cumsum_value + 1, self._negatives_buffer_size)
         cumsum_table_indices = np.searchsorted(self._node_counts_cumsum, uniform_numbers)
         self._negatives_buffer = NegativesBuffer(cumsum_table_indices)
         return self._negatives_buffer.get_items(self.negative)
diff --git a/gensim/models/test_poincare.py b/gensim/models/test_poincare.py
new file mode 100644
index 0000000000..0adaafe4c9
--- /dev/null
+++ b/gensim/models/test_poincare.py
@@ -0,0 +1,14 @@
+from poincare import PoincareModel, PoincareRelations
+from time import time
+import numpy as np
+t1 = time()
+file_path = "C:\\Users\\sagar\\gensim\\gensim\\test\\test_data\\poincare_hypernyms_large.tsv"
+model = PoincareModel(PoincareRelations(file_path), negative=2)
+model.train(epochs=50)
+t2 = time()
+print(t2-t1)
+#print((np.random.randint.__doc__))
+
+
+print(np.random.RandomState.rand.__doc__)
+print(np.random.default_rng(1).gamma.__doc__)
\ No newline at end of file
diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 356f711408..d338c00517 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -384,7 +384,7 @@ def __init__(
         self.window = int(window)
         self.shrink_windows = bool(shrink_windows)
 
-        self.random = np.random.RandomState(seed)
+        self.random = np.random.default_rng(seed)
 
         self.hs = int(hs)
         self.negative = int(negative)
diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx
index ffdc908b5c..7e0b3e98af 100755
--- a/gensim/models/word2vec_inner.pyx
+++ b/gensim/models/word2vec_inner.pyx
@@ -489,7 +489,10 @@ cdef init_w2v_config(Word2VecConfig *c, model, alpha, compute_loss, _work, _neu1
         c[0].cum_table = <np.uint32_t *>(np.PyArray_DATA(model.cum_table))
         c[0].cum_table_len = len(model.cum_table)
     if c[0].negative or c[0].sample:
-        c[0].next_random = (2**24) * model.random.randint(0, 2**24) + model.random.randint(0, 2**24)
+        if isinstance(model.random, np.random.Generator):
+            c[0].next_random = (2**24) * model.random.integers(0, 2**24) + model.random.integers(0, 2**24)
+        else:
+            c[0].next_random = (2**24) * model.random.randint(0, 2**24) + model.random.randint(0, 2**24)
 
     # convert Python structures to primitive types, so we can release the GIL
     c[0].work = <REAL_t *>np.PyArray_DATA(_work)
@@ -567,8 +570,12 @@ def train_batch_sg(model, sentences, alpha, _work, compute_loss):
 
     # precompute "reduced window" offsets in a single randint() call
     if model.shrink_windows:
-        for i, item in enumerate(model.random.randint(0, c.window, effective_words)):
-            c.reduced_windows[i] = item
+        if isinstance(model.random, np.random.Generator):
+            for i, item in enumerate(model.random.integers(0, c.window, effective_words)):
+                c.reduced_windows[i] = item
+        else:
+            for i, item in enumerate(model.random.randint(0, c.window, effective_words)):
+                c.reduced_windows[i] = item
     else:
         for i in range(effective_words):
             c.reduced_windows[i] = 0
@@ -667,8 +674,12 @@ def train_batch_cbow(model, sentences, alpha, _work, _neu1, compute_loss):
 
     # precompute "reduced window" offsets in a single randint() call
     if model.shrink_windows:
-        for i, item in enumerate(model.random.randint(0, c.window, effective_words)):
-            c.reduced_windows[i] = item
+        if isinstance(model.random, np.random.Generator):
+            for i, item in enumerate(model.random.integers(0, c.window, effective_words)):
+                c.reduced_windows[i] = item
+        else:
+            for i, item in enumerate(model.random.randint(0, c.window, effective_words)):
+                c.reduced_windows[i] = item
     else:
         for i in range(effective_words):
             c.reduced_windows[i] = 0
diff --git a/gensim/test/test_ldamodel.py b/gensim/test/test_ldamodel.py
index b809b39754..1376cbb933 100644
--- a/gensim/test/test_ldamodel.py
+++ b/gensim/test/test_ldamodel.py
@@ -31,9 +31,9 @@
 
 
 def test_random_state():
-    testcases = [np.random.seed(0), None, np.random.RandomState(0), 0]
+    testcases = [np.random.seed(0), None, np.random.default_rng(0), 0]
     for testcase in testcases:
-        assert(isinstance(utils.get_random_state(testcase), np.random.RandomState))
+        assert(isinstance(utils.get_random_state(testcase), np.random.Generator))
 
 
 class TestLdaModel(unittest.TestCase, basetmtests.TestBaseTopicModel):
@@ -51,8 +51,8 @@ def test_sync_state(self):
         assert_allclose(self.model.get_topics(), model2.get_topics(), rtol=1e-5)
 
         # properly continues training on the new state
-        self.model.random_state = np.random.RandomState(0)
-        model2.random_state = np.random.RandomState(0)
+        self.model.random_state = np.random.default_rng(0)
+        model2.random_state = np.random.default_rng(0)
         self.model.passes = 1
         model2.passes = 1
         self.model.update(self.corpus)
diff --git a/gensim/test/test_matutils.py b/gensim/test/test_matutils.py
index 97e4189d89..a834af0e64 100644
--- a/gensim/test/test_matutils.py
+++ b/gensim/test/test_matutils.py
@@ -86,7 +86,7 @@ def dirichlet_expectation(alpha):
 
 
 class TestLdaModelInner(unittest.TestCase):
     def setUp(self):
-        self.random_state = np.random.RandomState()
+        self.random_state = np.random.default_rng()
         self.num_runs = 100  # test functions with *num_runs* random inputs
         self.num_topics = 100
diff --git a/gensim/utils.py b/gensim/utils.py
index 30b6d85f58..47c665a9b0 100644
--- a/gensim/utils.py
+++ b/gensim/utils.py
@@ -86,10 +86,10 @@ def get_random_state(seed):
     """
     if seed is None or seed is np.random:
-        return np.random.mtrand._rand
+        return np.random.default_rng()
     if isinstance(seed, (numbers.Integral, np.integer)):
-        return np.random.RandomState(seed)
-    if isinstance(seed, np.random.RandomState):
+        return np.random.default_rng(seed)
+    if isinstance(seed, np.random.Generator):
         return seed
     raise ValueError('%r cannot be used to seed a np.random.RandomState instance' % seed)
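
A minimal sketch of the RandomState -> Generator call mapping applied throughout this patch
(the seed and sizes below are illustrative only, not values taken from gensim):

    import numpy as np

    # legacy interface that the patch migrates away from
    legacy = np.random.RandomState(42)
    legacy.rand(3)                 # uniform floats in [0.0, 1.0)
    legacy.randint(0, 10, size=5)  # ints in [0, 10), high end exclusive

    # new Generator interface returned by np.random.default_rng()
    rng = np.random.default_rng(42)
    rng.random(3)                  # Generator equivalent of RandomState.rand
    rng.integers(0, 10, size=5)    # Generator equivalent of RandomState.randint

    # code that may still receive a legacy RandomState (e.g. a model unpickled from an
    # older gensim) can branch on the type, as poincare.py and word2vec_inner.pyx do here
    if isinstance(rng, np.random.Generator):
        values = rng.integers(0, 10, size=5)
    else:
        values = rng.randint(0, 10, size=5)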