From 359873b85eff7d37736cb54158d5c660f243db6f Mon Sep 17 00:00:00 2001 From: Hobson Lane Date: Sun, 13 Nov 2016 13:51:56 -0800 Subject: [PATCH 01/13] use pyscaffolding to avoid import of lshash to retrieve __version__ --- lshash/__init__.py | 16 +++++---------- setup.py | 51 +++++++++++++++------------------------------- 2 files changed, 21 insertions(+), 46 deletions(-) diff --git a/lshash/__init__.py b/lshash/__init__.py index e805896..896994c 100644 --- a/lshash/__init__.py +++ b/lshash/__init__.py @@ -1,12 +1,6 @@ -# lshash/__init__.py -# Copyright 2012 Kay Zhu (a.k.a He Zhu) and contributors (see CONTRIBUTORS.txt) -# -# This module is part of lshash and is released under -# the MIT License: http://www.opensource.org/licenses/mit-license.php +import pkg_resources -__title__ = 'lshash' -__author__ = 'Kay Zhu (me@kayzhu.com)' -__license__ = 'MIT' -__version__ = '0.0.4dev' - -from lshash import LSHash +try: + __version__ = pkg_resources.get_distribution(__name__).version +except: + __version__ = 'unknown' diff --git a/setup.py b/setup.py index b0a98e5..a91dd06 100644 --- a/setup.py +++ b/setup.py @@ -1,42 +1,23 @@ +#!/usr/bin/env python # -*- coding: utf-8 -*- +""" + Setup file for lshash. -import lshash + This file was generated with PyScaffold 2.5.6, a tool that easily + puts up a scaffold for your new Python project. Learn more under: + http://pyscaffold.readthedocs.org/ +""" -try: - from setuptools import setup -except ImportError: - from distutils.core import setup +import sys +from setuptools import setup -with open('README.rst') as f: - readme = f.read() +def setup_package(): + needs_sphinx = {'build_sphinx', 'upload_docs'}.intersection(sys.argv) + sphinx = ['sphinx'] if needs_sphinx else [] + setup(setup_requires=['six', 'pyscaffold>=2.5a0,<2.6a0'] + sphinx, + use_pyscaffold=True) -with open('LICENSE') as f: - license = f.read() -with open('CHANGES.rst') as f: - changes = f.read() - -required = ['numpy'] - -setup( - name='lshash', - version=lshash.__version__, - packages=['lshash'], - author='Kay Zhu', - author_email='me@kayzhu.com', - maintainer='Kay Zhu', - maintainer_email='me@kayzhu.com', - description='A fast Python implementation of locality sensitive hashing with persistance support.', - long_description=readme + '\n\n' + changes, - license=license, - requires=required, - classifiers=[ - 'Intended Audience :: Developers', - 'License :: OSI Approved :: MIT License', - 'Operating System :: OS Independent', - 'Programming Language :: Python', - 'Programming Language :: Python :: 2', - 'Topic :: Software Development :: Libraries', - ], -) +if __name__ == "__main__": + setup_package() From 62a0f30a92a1f8b45107fc1782f13254375561a0 Mon Sep 17 00:00:00 2001 From: Hobson Lane Date: Sun, 13 Nov 2016 14:09:26 -0800 Subject: [PATCH 02/13] start moving lshash.__init__.__*__ to setup.cfg --- AUTHORS.rst | 5 +++ LICENSE | 20 +----------- lshash/__init__.py | 2 +- setup.cfg | 78 ++++++++++++++++++++++++++++++++++++++++++++++ setup.py | 8 ++--- 5 files changed, 87 insertions(+), 26 deletions(-) create mode 100644 AUTHORS.rst create mode 100644 setup.cfg diff --git a/AUTHORS.rst b/AUTHORS.rst new file mode 100644 index 0000000..c6fb717 --- /dev/null +++ b/AUTHORS.rst @@ -0,0 +1,5 @@ +========== +Developers +========== + +* Kay Zhu (a.k.a He Zhu) diff --git a/LICENSE b/LICENSE index f0f1c8c..865a7e8 100644 --- a/LICENSE +++ b/LICENSE @@ -1,19 +1 @@ -Copyright 2012 Kay Zhu (a.k.a He Zhu) - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +Copyright 2016 Hobson Lane diff --git a/lshash/__init__.py b/lshash/__init__.py index 896994c..33a96e6 100644 --- a/lshash/__init__.py +++ b/lshash/__init__.py @@ -3,4 +3,4 @@ try: __version__ = pkg_resources.get_distribution(__name__).version except: - __version__ = 'unknown' + __version__ = '0.0.4dev' diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..d6b3cab --- /dev/null +++ b/setup.cfg @@ -0,0 +1,78 @@ +[metadata] + +name = lshash +summary = A fast Python implementation of locality sensitive hashing with persistance (Redis) support. +author = Kay Zhu +author-email = me@kayzhu.com +license = MIT +home-page = http://... +description-file = README.rst +# Add here all kinds of additional classifiers as defined under +# https://pypi.python.org/pypi?%3Aaction=list_classifiers +classifier = + Development Status :: 4 - Beta + Programming Language :: Python + +[entry_points] +# Add here console scripts like: +# console_scripts = +# script_name = lshash.module:function +# For example: +# console_scripts = +# fibonacci = lshash.skeleton:run +# as well as other entry_points. + + +[files] +# Add here 'data_files', 'packages' or 'namespace_packages'. +# Additional data files are defined as key value pairs of source and target: +packages = + lshash +# data_files = +# share/lshash_docs = docs/* + +[extras] +# Add here additional requirements for extra features, like: +# PDF = +# ReportLab>=1.2 +# RXP + +[test] +# py.test options when running `python setup.py test` +addopts = tests + +[pytest] +# Options for py.test: +# Specify command line options as you would do when invoking py.test directly. +# e.g. --cov-report html (or xml) for html/xml output or --junitxml junit.xml +# in order to write a coverage file that can be read by Jenkins. +addopts = + --cov lshash --cov-report term-missing + --verbose + +[aliases] +docs = build_sphinx + +[bdist_wheel] +# Use this option if your package is pure-python +universal = 1 + +[build_sphinx] +source_dir = docs +build_dir = docs/_build + +[pbr] +# Let pbr run sphinx-apidoc +autodoc_tree_index_modules = True +# autodoc_tree_excludes = ... +# Let pbr itself generate the apidoc +# autodoc_index_modules = True +# autodoc_exclude_modules = ... +# Convert warnings to errors +# warnerrors = True + +[devpi:upload] +# Options for the devpi: PyPI server and packaging tool +# VCS export must be deactivated since we are using setuptools-scm +no-vcs = 1 +formats = bdist_wheel diff --git a/setup.py b/setup.py index a91dd06..6765d73 100644 --- a/setup.py +++ b/setup.py @@ -1,11 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -""" - Setup file for lshash. - - This file was generated with PyScaffold 2.5.6, a tool that easily - puts up a scaffold for your new Python project. Learn more under: - http://pyscaffold.readthedocs.org/ +""" Setup file for lshash. + This file was generated with PyScaffold 2.5.6 """ import sys From e38a3a24f7cbc53324015f9069fd25e545f15c09 Mon Sep 17 00:00:00 2001 From: Hobson Lane Date: Sun, 13 Nov 2016 14:13:38 -0800 Subject: [PATCH 03/13] fix license --- LICENSE | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index 865a7e8..f0f1c8c 100644 --- a/LICENSE +++ b/LICENSE @@ -1 +1,19 @@ -Copyright 2016 Hobson Lane +Copyright 2012 Kay Zhu (a.k.a He Zhu) + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. From c7c9e0b90e9e610d9289095a9a55765cbed92bf5 Mon Sep 17 00:00:00 2001 From: Hobson Lane Date: Sun, 13 Nov 2016 14:20:46 -0800 Subject: [PATCH 04/13] add "Extras" for Redis --- setup.cfg | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/setup.cfg b/setup.cfg index d6b3cab..40bc1b3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -33,9 +33,8 @@ packages = [extras] # Add here additional requirements for extra features, like: -# PDF = -# ReportLab>=1.2 -# RXP +Redis = + redis>=2.10.0 [test] # py.test options when running `python setup.py test` From 995fa42e3d93ac97f42b0320f01309f5e8cdff2b Mon Sep 17 00:00:00 2001 From: Hobson Lane Date: Sun, 13 Nov 2016 14:38:48 -0800 Subject: [PATCH 05/13] restore pypi "Categories" from previous setup.py --- setup.cfg | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/setup.cfg b/setup.cfg index 40bc1b3..e88e0b8 100644 --- a/setup.cfg +++ b/setup.cfg @@ -12,6 +12,11 @@ description-file = README.rst classifier = Development Status :: 4 - Beta Programming Language :: Python + Programming Language :: Python :: 2 + Intended Audience :: Developers + License :: OSI Approved :: MIT License + Operating System :: OS Independent + Topic :: Software Development :: Libraries [entry_points] # Add here console scripts like: From 3dda96c025fe33eef4219d33e977bd350ecd3613 Mon Sep 17 00:00:00 2001 From: Hobson Lane Date: Sun, 13 Nov 2016 16:27:05 -0800 Subject: [PATCH 06/13] some tests --- requirements.txt | 3 +++ setup.cfg | 4 +++- test-requirements.txt | 5 +++++ tests/conftest.py | 12 ++++++++++++ tests/test_float3d.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 requirements.txt create mode 100644 test-requirements.txt create mode 100644 tests/conftest.py create mode 100644 tests/test_float3d.py diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..50d57fb --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +# bitarray +numpy +# redis diff --git a/setup.cfg b/setup.cfg index e88e0b8..dd4d1df 100644 --- a/setup.cfg +++ b/setup.cfg @@ -39,7 +39,9 @@ packages = [extras] # Add here additional requirements for extra features, like: Redis = - redis>=2.10.0 + redis>=2.10.0 +Hamming = + bitarray [test] # py.test options when running `python setup.py test` diff --git a/test-requirements.txt b/test-requirements.txt new file mode 100644 index 0000000..468f195 --- /dev/null +++ b/test-requirements.txt @@ -0,0 +1,5 @@ +# Add requirements only needed for your unittests and during development here. +# They will be installed automatically when running `python setup.py test`. +# ATTENTION: Don't remove pytest-cov and pytest as they are needed. +pytest-cov +pytest diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..aab09f8 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" + Dummy conftest.py for lshash. + + If you don't know what this is for, just leave it empty. + Read more about conftest.py under: + https://pytest.org/latest/plugins.html +""" +from __future__ import print_function, absolute_import, division + +import pytest diff --git a/tests/test_float3d.py b/tests/test_float3d.py new file mode 100644 index 0000000..f8d547d --- /dev/null +++ b/tests/test_float3d.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import pytest +import numpy as np + +from lshash import LSHash + +__author__ = "Hobson Lane" +__copyright__ = "Kay Zhu (a.k.a He Zhu)" +__license__ = "MIT" + + +def test_sphere(): + X = np.random.normal(size=(1000, 3)) + lsh = LSHash(10, 3, num_hashtables=5) + for x in X: + x /= np.linalg.norm(x) + lsh.index(x) + closest = lsh.query(X[0] + np.array([-0.001, 0.001, -0.001]), distance_func="cosine") + assert len(closest) >= 10 + assert 0.05 >= closest[9][-1] > 0.0003 + + +def test_hyperspheres(): + tenthclosest = [] + for D in range(2, 11): + X = np.random.normal(size=(200000, D)) + lsh = LSHash(32, D, num_hashtables=D) + for x in X: + lsh.index(x) + x /= np.linalg.norm(x) + # closest = lsh.query(X[0] + np.array([0.001] * D), distance_func="cosine") + x = np.random.normal(size=(D,)) + x /= np.linalg.norm(x) + closest = lsh.query(x, distance_func='cosine') + N = len(closest) + tenthclosest += [[D, N, closest[min(9, N - 1)][-1] if N else None]] + print(tenthclosest[-1]) + for i, tc in enumerate(tenthclosest): + assert 1e-9 < tc[-1] or 1e-6 < 0.2 + return tenthclosest From 5eb2564f1623223f86baf4a8f4d19afd9e8e5bef Mon Sep 17 00:00:00 2001 From: Hobson Lane Date: Sun, 13 Nov 2016 17:02:44 -0800 Subject: [PATCH 07/13] test_spheres while recording distances --- tests/{test_float3d.py => test_spheres.py} | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) rename tests/{test_float3d.py => test_spheres.py} (64%) diff --git a/tests/test_float3d.py b/tests/test_spheres.py similarity index 64% rename from tests/test_float3d.py rename to tests/test_spheres.py index f8d547d..2e0c238 100644 --- a/tests/test_float3d.py +++ b/tests/test_spheres.py @@ -26,17 +26,23 @@ def test_hyperspheres(): tenthclosest = [] for D in range(2, 11): X = np.random.normal(size=(200000, D)) - lsh = LSHash(32, D, num_hashtables=D) + lsh = LSHash(int(64 / D) + D, D, num_hashtables=D) + + # query vector + q = np.random.normal(size=(D,)) + q /= np.linalg.norm(q) + + distances = [] for x in X: lsh.index(x) x /= np.linalg.norm(x) - # closest = lsh.query(X[0] + np.array([0.001] * D), distance_func="cosine") - x = np.random.normal(size=(D,)) - x /= np.linalg.norm(x) - closest = lsh.query(x, distance_func='cosine') + distances += [1 - np.sum(x * q)] + distances = sorted(distances) + closest = lsh.query(q, distance_func='cosine') N = len(closest) - tenthclosest += [[D, N, closest[min(9, N - 1)][-1] if N else None]] + rank = min(10, N) + tenthclosest += [[D, N - 1, closest[rank - 1][-1] if N else None, distances[rank - 1]]] print(tenthclosest[-1]) for i, tc in enumerate(tenthclosest): - assert 1e-9 < tc[-1] or 1e-6 < 0.2 + assert 1e-9 < tc[-2] or 1e-6 < 0.2 return tenthclosest From bb8ce6ac55ec82e512827ccecf81f118736807ed Mon Sep 17 00:00:00 2001 From: Hobson Lane Date: Wed, 26 Apr 2017 21:11:25 -0700 Subject: [PATCH 08/13] make tests.test_spheres a module --- lshash/lshash.py | 26 +++++++++++++++++++++++--- requirements.txt | 3 ++- tests/__init__.py | 0 tests/test_spheres.py | 39 ++++++++++++++++++++++++++++----------- 4 files changed, 53 insertions(+), 15 deletions(-) create mode 100644 tests/__init__.py diff --git a/lshash/lshash.py b/lshash/lshash.py index 5c895a6..e72254b 100644 --- a/lshash/lshash.py +++ b/lshash/lshash.py @@ -3,12 +3,26 @@ # # This module is part of lshash and is released under # the MIT License: http://www.opensource.org/licenses/mit-license.php +# -*- coding: utf-8 -*- +from __future__ import print_function, unicode_literals, division, absolute_import +from builtins import int, round, str, object # noqa +from future import standard_library +standard_library.install_aliases() # noqa: Counter, OrderedDict, +from past.builtins import basestring # noqa: + +import future # noqa +import builtins # noqa +import past # noqa +import six # noqa import os import json import numpy as np -from storage import storage +try: + from storage import storage # py2 +except ImportError: + from .storage import storage # py3 try: from bitarray import bitarray @@ -16,6 +30,12 @@ bitarray = None +try: + xrange # py2 +except NameError: + xrange = range # py3 + + class LSHash(object): """ LSHash implments locality sensitive hashing using random projection for input vectors of dimension `input_dim`. @@ -263,7 +283,7 @@ def query(self, query_point, num_results=None, distance_func=None): # rank candidates by distance function candidates = [(ix, d_func(query_point, self._as_np_array(ix))) for ix in candidates] - candidates.sort(key=lambda x: x[1]) + candidates = sorted(candidates, key=lambda x: x[1]) return candidates[:num_results] if num_results else candidates @@ -298,4 +318,4 @@ def l1norm_dist(x, y): @staticmethod def cosine_dist(x, y): - return 1 - np.dot(x, y) / ((np.dot(x, x) * np.dot(y, y)) ** 0.5) + return 1 - float(np.dot(x, y)) / ((np.dot(x, x) * np.dot(y, y)) ** 0.5) diff --git a/requirements.txt b/requirements.txt index 50d57fb..373f4a5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ -# bitarray +bitarray numpy +storage # redis diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_spheres.py b/tests/test_spheres.py index 2e0c238..988e124 100644 --- a/tests/test_spheres.py +++ b/tests/test_spheres.py @@ -1,10 +1,18 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +from __future__ import print_function, unicode_literals, division, absolute_import +from builtins import int, round, str, object # noqa +from future import standard_library +standard_library.install_aliases() # noqa: Counter, OrderedDict, +from past.builtins import basestring # noqa: -import pytest import numpy as np -from lshash import LSHash +import future # noqa +import builtins # noqa +import past # noqa +import six # noqa +from lshash.lshash import LSHash __author__ = "Hobson Lane" __copyright__ = "Kay Zhu (a.k.a He Zhu)" @@ -22,27 +30,36 @@ def test_sphere(): assert 0.05 >= closest[9][-1] > 0.0003 -def test_hyperspheres(): +def test_hyperspheres(X=np.random.uniform(size=(200000, 10))): + """ Demonstrate curse of dimensionality and where LSH starts to fail + + Returns: + lsh, X, secondclosest, tenthclosest + + """ tenthclosest = [] - for D in range(2, 11): - X = np.random.normal(size=(200000, D)) - lsh = LSHash(int(64 / D) + D, D, num_hashtables=D) + secondclosest = [] + for D in range(2, X.shape[1]): + lsh = LSHash(int(512. / D) + 1, D, num_hashtables=D) # query vector - q = np.random.normal(size=(D,)) + q = np.random.uniform(size=(D,)) q /= np.linalg.norm(q) distances = [] - for x in X: + for x in X[:, :D]: lsh.index(x) x /= np.linalg.norm(x) - distances += [1 - np.sum(x * q)] + distances += [1. - np.sum(x * q)] # cosine similarity distances = sorted(distances) - closest = lsh.query(q, distance_func='cosine') + closest = lsh.query(q, num_results=10, distance_func='cosine') + N = len(closest) rank = min(10, N) tenthclosest += [[D, N - 1, closest[rank - 1][-1] if N else None, distances[rank - 1]]] print(tenthclosest[-1]) + secondclosest += [[D, N - 1, closest[min(2, N) - 1][-1] if N else None, distances[rank - 1]]] + print(secondclosest[-1]) for i, tc in enumerate(tenthclosest): assert 1e-9 < tc[-2] or 1e-6 < 0.2 - return tenthclosest + return lsh, X, secondclosest, tenthclosest From c29e0b1206e228cacb99fb3bed2196a6a7752192 Mon Sep 17 00:00:00 2001 From: Hobson Lane Date: Wed, 26 Apr 2017 21:21:30 -0700 Subject: [PATCH 09/13] change name to lshash3 --- requirements.txt | 1 - setup.cfg | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 373f4a5..39c78fb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ bitarray numpy -storage # redis diff --git a/setup.cfg b/setup.cfg index dd4d1df..63c4592 100644 --- a/setup.cfg +++ b/setup.cfg @@ -13,6 +13,7 @@ classifier = Development Status :: 4 - Beta Programming Language :: Python Programming Language :: Python :: 2 + Programming Language :: Python :: 3 Intended Audience :: Developers License :: OSI Approved :: MIT License Operating System :: OS Independent @@ -32,7 +33,7 @@ classifier = # Add here 'data_files', 'packages' or 'namespace_packages'. # Additional data files are defined as key value pairs of source and target: packages = - lshash + lshash3 # data_files = # share/lshash_docs = docs/* From cc60e32318fb73f8bec601489f857dc7c49ab441 Mon Sep 17 00:00:00 2001 From: Hobson Lane Date: Wed, 26 Apr 2017 21:25:13 -0700 Subject: [PATCH 10/13] name lshash3 package lshash --- setup.cfg | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index 63c4592..98a1dc4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] -name = lshash +name = lshash3 summary = A fast Python implementation of locality sensitive hashing with persistance (Redis) support. author = Kay Zhu author-email = me@kayzhu.com @@ -33,7 +33,8 @@ classifier = # Add here 'data_files', 'packages' or 'namespace_packages'. # Additional data files are defined as key value pairs of source and target: packages = - lshash3 + lshash + tests # data_files = # share/lshash_docs = docs/* From c57a145a45e1fccceb37a1f4a3ce154a773df613 Mon Sep 17 00:00:00 2001 From: Hobson Lane Date: Wed, 26 Apr 2017 23:47:48 -0700 Subject: [PATCH 11/13] broke test_spheres.py --- AUTHORS.rst | 1 + CHANGES.rst | 4 ++++ tests/test_spheres.py | 12 +++++++----- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index c6fb717..95b6eb6 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -3,3 +3,4 @@ Developers ========== * Kay Zhu (a.k.a He Zhu) +* Hobson Lane diff --git a/CHANGES.rst b/CHANGES.rst index d73c038..dad875a 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,3 +1,7 @@ +v0.0.8 2017/04/28 -- `$ pip install lshash3`, `>>> import lshash.lshash.LSHash` because it's Python3 compatible! +v0.0.7 2017/04/28 -- Package name is now lshash3 because it's Python3 compatible! Demo hyperspheres in tests. +v0.0.6 2016/12/01 -- PyScaffolding's setup.cfg eliminates `import lshash` in setup.py but still fails for an Ubuntu python3 virtualenv +v0.0.5 2016/12/01 -- PyScaffolding's setup.cfg eliminates `import lshash` in setup.py v0.0.3, 2012/12/28 -- Doc fixes. v0.0.2, 2012/12/28 -- Doc fixes and lowercase package name. v0.0.1, 2012/12/20 -- Initial release. diff --git a/tests/test_spheres.py b/tests/test_spheres.py index 988e124..dec798d 100644 --- a/tests/test_spheres.py +++ b/tests/test_spheres.py @@ -31,7 +31,7 @@ def test_sphere(): def test_hyperspheres(X=np.random.uniform(size=(200000, 10))): - """ Demonstrate curse of dimensionality and where LSH starts to fail + """ Demonstrate curse of dimensionality and where LSH starts to fail Returns: lsh, X, secondclosest, tenthclosest @@ -56,10 +56,12 @@ def test_hyperspheres(X=np.random.uniform(size=(200000, 10))): N = len(closest) rank = min(10, N) - tenthclosest += [[D, N - 1, closest[rank - 1][-1] if N else None, distances[rank - 1]]] - print(tenthclosest[-1]) - secondclosest += [[D, N - 1, closest[min(2, N) - 1][-1] if N else None, distances[rank - 1]]] - print(secondclosest[-1]) + tenthclosest += [[D, min(10, N), closest[rank - 1][-1] if N else None, distances[rank - 1]]] + secondclosest += [[D, min(2, N), closest[min(2, N) - 1][-1] if N else None, distances[min(2, N) - 1]]] + for row in tenthclosest: + print(row) + for row in secondclosest: + print(row) for i, tc in enumerate(tenthclosest): assert 1e-9 < tc[-2] or 1e-6 < 0.2 return lsh, X, secondclosest, tenthclosest From 6cc2f25c9da02e3a65a05119ca873df65aee7d20 Mon Sep 17 00:00:00 2001 From: Hobson Lane Date: Wed, 26 Apr 2017 23:52:48 -0700 Subject: [PATCH 12/13] revert test_spheres --- tests/test_spheres.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tests/test_spheres.py b/tests/test_spheres.py index dec798d..988e124 100644 --- a/tests/test_spheres.py +++ b/tests/test_spheres.py @@ -31,7 +31,7 @@ def test_sphere(): def test_hyperspheres(X=np.random.uniform(size=(200000, 10))): - """ Demonstrate curse of dimensionality and where LSH starts to fail + """ Demonstrate curse of dimensionality and where LSH starts to fail Returns: lsh, X, secondclosest, tenthclosest @@ -56,12 +56,10 @@ def test_hyperspheres(X=np.random.uniform(size=(200000, 10))): N = len(closest) rank = min(10, N) - tenthclosest += [[D, min(10, N), closest[rank - 1][-1] if N else None, distances[rank - 1]]] - secondclosest += [[D, min(2, N), closest[min(2, N) - 1][-1] if N else None, distances[min(2, N) - 1]]] - for row in tenthclosest: - print(row) - for row in secondclosest: - print(row) + tenthclosest += [[D, N - 1, closest[rank - 1][-1] if N else None, distances[rank - 1]]] + print(tenthclosest[-1]) + secondclosest += [[D, N - 1, closest[min(2, N) - 1][-1] if N else None, distances[rank - 1]]] + print(secondclosest[-1]) for i, tc in enumerate(tenthclosest): assert 1e-9 < tc[-2] or 1e-6 < 0.2 return lsh, X, secondclosest, tenthclosest From 83a568271c5d69aa7960bbdedba81254b5c44027 Mon Sep 17 00:00:00 2001 From: Hobson Lane Date: Wed, 26 Apr 2017 23:53:12 -0700 Subject: [PATCH 13/13] closest, second and tenth --- tests/test_spheres.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/tests/test_spheres.py b/tests/test_spheres.py index 988e124..9421662 100644 --- a/tests/test_spheres.py +++ b/tests/test_spheres.py @@ -1,5 +1,4 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- +#!usr/bin/env python3 from __future__ import print_function, unicode_literals, division, absolute_import from builtins import int, round, str, object # noqa from future import standard_library @@ -30,17 +29,22 @@ def test_sphere(): assert 0.05 >= closest[9][-1] > 0.0003 -def test_hyperspheres(X=np.random.uniform(size=(200000, 10))): - """ Demonstrate curse of dimensionality and where LSH starts to fail +def hyperspheres_10D(X=np.random.uniform(size=(200000, 10))): + """ Demonstrate curse of dimensionality and where LSH starts to fail Returns: lsh, X, secondclosest, tenthclosest + >>> import pandas as pd + >>> lsh, vectors, rank1, rank2, rank10 = test_hyperspheres() + >>> pd.DataFrame(rank2) + >>> pd.DataFrame(rank10) """ tenthclosest = [] secondclosest = [] + closest = [] for D in range(2, X.shape[1]): - lsh = LSHash(int(512. / D) + 1, D, num_hashtables=D) + lsh = LSHash(int(1024 * 8182. / D) + D, D, num_hashtables=D) # query vector q = np.random.uniform(size=(D,)) @@ -52,14 +56,16 @@ def test_hyperspheres(X=np.random.uniform(size=(200000, 10))): x /= np.linalg.norm(x) distances += [1. - np.sum(x * q)] # cosine similarity distances = sorted(distances) - closest = lsh.query(q, num_results=10, distance_func='cosine') + print(distances[:10]) + closest10 = lsh.query(q, distance_func='cosine') - N = len(closest) - rank = min(10, N) - tenthclosest += [[D, N - 1, closest[rank - 1][-1] if N else None, distances[rank - 1]]] + N = len(closest10) + tenthclosest += [[D, min(9, N - 1) if N else -1, closest10[min(9, N - 1)][-1] if N else 2., distances[min(9, N - 1)]]] + secondclosest += [[D, min(1, N - 1) if N else -1, closest10[min(1, N - 1)][-1] if N else 2., distances[min(1, N - 1)]]] + closest += [[D, 0 if N else -1, closest10[0][-1] if N else 2., distances[0]]] print(tenthclosest[-1]) - secondclosest += [[D, N - 1, closest[min(2, N) - 1][-1] if N else None, distances[rank - 1]]] print(secondclosest[-1]) - for i, tc in enumerate(tenthclosest): - assert 1e-9 < tc[-2] or 1e-6 < 0.2 - return lsh, X, secondclosest, tenthclosest + print(closest[-1]) + # for i, tc in enumerate(tenthclosest): + # assert 1e-9 < tc[-2] or 1e-6 < 0.2 + return lsh, X, closest, secondclosest, tenthclosest