diff --git a/.gitignore b/.gitignore index b6e4761..01314e2 100644 --- a/.gitignore +++ b/.gitignore @@ -127,3 +127,4 @@ dmypy.json # Pyre type checker .pyre/ +.jekyll-cache diff --git a/Makefile b/Makefile index 3a9f5d3..f56638b 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # Modified from Makefile of CoverTree # https://github.com/manzilzaheer/CoverTree -# +# # Copyright (c) 2017 Manzil Zaheer All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -24,7 +24,7 @@ CLEAN_PROGS = $(subst $(CURR_DIR)/src/,clean-,$(SOURCES)) CTYPE = gcc -.PHONY: all dir compile $(SOURCES) +.PHONY: all dir compile $(SOURCES) docs-build docs-serve docs-deploy all: dir compile py @@ -45,7 +45,7 @@ dir: @echo Setting up directories @mkdir -p $(BUILDDIR) @mkdir -p dist - + compile: $(SOURCES) @@ -65,3 +65,15 @@ $(PROGS): % : $(CURR_DIR)/src/%/makefile $(CLEAN_PROGS): clean-% : $(CURR_DIR)/src/%/makefile rm -rf build/$(subst clean-,,$@) rm -rf dist/$(subst clean-,,$@) + +# Documentation: build the Jekyll homepage plus the Sphinx API docs into docs/build/html, preview them locally, or deploy them. +docs-build: + cp README.md homepage/_includes/ + cd homepage && bundle exec jekyll build -d ../docs/build/html + sphinx-build -b html docs/source/ docs/build/html/documentation + +docs-serve: docs-build + cd homepage && bundle exec jekyll serve -d ../docs/build/html --skip-initial-build + +docs-deploy: docs-build + $(MAKE) -C docs gh-deploy diff --git a/README.md b/README.md index 8c4db24..5b4b7ff 100644 --- a/README.md +++ b/README.md @@ -29,12 +29,12 @@ make pip install build python -m build --wheel # which can be used as: -# pip install --force dist/graphgrove-0.0.1-cp37-cp37m-linux_x86_64.whl +# pip install --force dist/graphgrove-0.0.1-cp37-cp37m-linux_x86_64.whl ``` ## Examples -Toy examples of [clustering](examples/clustering.py), [DAG-structured clustering](examples/dag_clustering.py), and [nearest neighbor search](examples/nearest_neighbor_search.py) are available. 
+Toy examples of [clustering](https://github.com/nmonath/graphgrove/blob/main/examples/clustering.py), [DAG-structured clustering](https://github.com/nmonath/graphgrove/blob/main/examples/dag_clustering.py), and [nearest neighbor search](https://github.com/nmonath/graphgrove/blob/main/examples/nearest_neighbor_search.py) are available. At a high level, incremental clustering can be done as: @@ -57,7 +57,7 @@ cores=4 tree = gg.graph_builder.Cosine_SGTree(k=k, cores=cores) # data_batches - generator of numpy matrices mini-batch-size by dim for batch in data_batches: - tree.insert(batch) # or tree.insert_and_knn(batch) + tree.insert(batch) # or tree.insert_and_knn(batch) ``` ## Algorithms Implemented @@ -70,3 +70,9 @@ Clustering: Nearest Neighbor Search: * CoverTree: Alina Beygelzimer, Sham Kakade, and John Langford. "Cover trees for nearest neighbor." ICML. 2006. * SGTree: SG-Tree is a new data structure for exact nearest neighbor search inspired from Cover Tree and its improvement, which has been used in the TerraPattern project. At a high level, SG-Tree tries to create a hierarchical tree where each node performs a "coarse" clustering. The centers of these "clusters" become the children and subsequent insertions are recursively performed on these children. When performing the NN query, we prune out solutions based on a subset of the dimensions that are being queried. This is particularly useful when trying to find the nearest neighbor in highly clustered subset of the data, e.g. when the data comes from a recursive mixture of Gaussians or more generally time marginalized coalscent process . The effect of these two optimizations is that our data structure is extremely simple, highly parallelizable and is comparable in performance to existing NN implementations on many data-sets. Manzil Zaheer, Guru Guruganesh, Golan Levin, Alexander Smola. [TerraPattern: A Nearest Neighbor Search Service](http://manzil.ml/res/Papers/2019_sgtree.pdf). 2019. 
+ +## Credits + +Special thanks to the following contributors: + +- Andrew Drozdov ([@mrdrozdov](https://github.com/mrdrozdov)) diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..47ba87b --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,24 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help gh-deploy Makefile +# Build the HTML docs, then publish build/html to GitHub Pages with ghp-import. +gh-deploy: + @$(MAKE) html + @ghp-import build/html -p -o -n + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..6ad0bf7 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,24 @@ +Documentation is built using sphinx. + +``` +pip install sphinx +pip install sphinx-rtd-theme # Theme for "Read the Docs". +pip install ghp-import # For publishing to github pages. +pip install m2r2 # For importing markdown files (i.e. README.md). +``` + +Build documentation: + +``` +sphinx-build -b html docs/source/ docs/build/html/documentation +``` + +Deploy to github pages: + +``` +cd docs && make gh-deploy +``` + +Additional notes: + +- It's recommended to write docstrings in Google style. 
https://www.sphinx-doc.org/en/master/usage/extensions/napoleon.html#google-vs-numpy diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..6fcf05b --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..1b304ed --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,65 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. + +import os +import sys + +# Adding missing paths. +modules_path = os.path.abspath('../../.') +print(f'ADDING PATH: {modules_path}') +sys.path.insert(0, modules_path) + +# Mock tricky modules. 
+autodoc_mock_imports = ['covertreec', 'llamac', 'sccc', 'sgtreec'] + + +# -- Project information ----------------------------------------------------- + +project = 'graphgrove' +copyright = '2021, Nicholas Monath' +author = 'Nicholas Monath' + +# The full version, including alpha/beta/rc tags +release = '0.0.11' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = ['m2r2', 'sphinx.ext.napoleon', 'sphinx.ext.autodoc'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +root_doc = 'index' + +html_extra_path = [] diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..ad48341 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,84 @@ +.. graphgrove documentation master file, created by + sphinx-quickstart on Mon Oct 4 14:13:47 2021. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to graphgrove's documentation! 
+====================================== + +A framework for building (and incrementally growing) graph-based data structures used in hierarchical or DAG-structured clustering and nearest neighbor search. + +Links +===== + +`Github Project `_ + +`Project Home <../index.html>`_ + +Package Classes +=============== + +.. autoclass:: graphgrove.covertree.Node + :members: + +.. autoclass:: graphgrove.covertree.NNS_L2 + :members: + +.. autoclass:: graphgrove.covertree.MIPS + :members: + +.. autoclass:: graphgrove.covertree.MCSS + :members: + +.. autoclass:: graphgrove.graph_builder.Index + :members: + +.. autoclass:: graphgrove.graph_builder.Cosine_CoverTree + :members: + +.. autoclass:: graphgrove.graph_builder.Cosine_SGTree + :members: + +.. autoclass:: graphgrove.graph_builder.Cosine_SGTreeBeam + :members: + +.. autoclass:: graphgrove.graph_builder.Cosine_FaissFlat + :members: + +.. autoclass:: graphgrove.graph_builder.Cosine_FaissHNSW + :members: + +.. autoclass:: graphgrove.llama.LLAMA + :members: + +.. autoclass:: graphgrove.scc.Node + :members: + +.. autoclass:: graphgrove.scc.Level + :members: + +.. autoclass:: graphgrove.scc.SCC + :members: + +.. autoclass:: graphgrove.sgtree.Node + :members: + +.. autoclass:: graphgrove.sgtree.NNS_L2 + :members: + +.. autoclass:: graphgrove.sgtree.MIPS + :members: + +.. autoclass:: graphgrove.sgtree.MCSS + :members: + +.. autoclass:: graphgrove.vec_scc.Cosine_SCC + :members: + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/graphgrove/llama.py b/graphgrove/llama.py index 5df7b1d..e5dfdfc 100644 --- a/graphgrove/llama.py +++ b/graphgrove/llama.py @@ -9,7 +9,8 @@ you may not use this file except in compliance with the License. 
You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -27,17 +28,17 @@ def __init__(self, this): def __del__(self): llamac.delete(self.this) - def cluster(self): + def cluster(self): """Run the DAG-clustering process.""" llamac.cluster(self.this) def assignments(self): """Return clusters of the DAG-structure discovered. - + Returns: - coo_matrix of size N by K where N is + coo_matrix of size N by K where N is the number of data points and K is the number - of nodes in the DAG. coo_matrix[i,j] = 1 if + of nodes in the DAG. coo_matrix[i,j] = 1 if the point i is a descendant of the node j (i.e., the cluster represented by node j contains point i). """ @@ -45,7 +46,7 @@ def assignments(self): def structure(self): """Return edges of the DAG-structure discovered. - + Returns: a numpy matrix of size M by 2 where M is the number of edges. each row of the has a value of [child_node_id, parent_node_id] @@ -54,35 +55,33 @@ def structure(self): def round(self, r): """Return the cover of the r^{th} round. - + Returns: - coo_matrix of size N by K_r where N is + coo_matrix of size N by K_r where N is the number of data points and K_r is the number - of nodes in r^{th} round. coo_matrix[i,j] = 1 if + of nodes in r^{th} round. coo_matrix[i,j] = 1 if the point i is a descendant of the node j (i.e., the cluster represented by node j contains point i). 
""" return llamac.get_round(self.this, r) @classmethod - def from_graph(cls, coo_graph, - num_rounds, cores=4, linkage=2, - max_num_parents=5, max_num_neighbors=100, + def from_graph(cls, coo_graph, + num_rounds, cores=4, linkage=2, + max_num_parents=5, max_num_neighbors=100, thresholds=None, lowest_value=-10000): """Instantiate a LLAMA object with the given graph & hyperparameters. - Arguments: - coo_graph -- the graph to cluster. - num_rounds -- the number of rounds to use. - - Keyword arguments: - cores -- number of parallel threads to use (default 4). - linkage -- linkage function to use either integer (0 for single, 1 for average, 2 for approx. average). (default 2). - or string valued ('single', 'average, 'approx_average') - max_num_parents -- maximum number of parents any node can have (default 5). - max_num_neighbors -- maximum number of neigbhors any node can have in the graph (default 100). - thresholds -- None (for no threshold use). Or a numpy array (float32) of the minimum similarity to allow in an agglomeration (default None). - lowest_value -- value used for missing / minimum similarity (default -10000) + Args: + coo_graph: the graph to cluster. + num_rounds: the number of rounds to use. + cores: number of parallel threads to use (default 4). + linkage: linkage function to use either integer (0 for single, 1 for average, 2 for approx. average). (default 2). + or string valued ('single', 'average', 'approx_average') + max_num_parents: maximum number of parents any node can have (default 5). + max_num_neighbors: maximum number of neighbors any node can have in the graph (default 100). + thresholds: None (for no threshold use). Or a numpy array (float32) of the minimum similarity to allow in an agglomeration (default None). 
+ lowest_value: value used for missing / minimum similarity (default -10000) """ rows, cols, sims = coo_graph.row.astype(np.uint32), coo_graph.col.astype(np.uint32), coo_graph.data.astype(np.float32) if len(rows.shape) == 1: @@ -91,7 +90,7 @@ def from_graph(cls, coo_graph, cols = cols[:, None] if len(sims.shape) == 1: sims = sims[:, None] - + if thresholds is None: thresholds = np.ones(num_rounds, dtype=np.float32) * lowest_value if type(linkage) == str: diff --git a/homepage/Gemfile b/homepage/Gemfile new file mode 100644 index 0000000..c18a827 --- /dev/null +++ b/homepage/Gemfile @@ -0,0 +1,34 @@ +source "https://rubygems.org" +# Hello! This is where you manage which Jekyll version is used to run. +# When you want to use a different version, change it below, save the +# file and run `bundle install`. Run Jekyll with `bundle exec`, like so: +# +# bundle exec jekyll serve +# +# This will help ensure the proper Jekyll version is running. +# Happy Jekylling! +gem "jekyll", "~> 4.2.1" +# This is the default theme for new Jekyll sites. You may change this to anything you like. +gem "minima", "~> 2.5" +# If you want to use GitHub Pages, remove the "gem "jekyll"" above and +# uncomment the line below. To upgrade, run `bundle update github-pages`. +# gem "github-pages", group: :jekyll_plugins +# If you have any plugins, put them here! +group :jekyll_plugins do + gem "jekyll-feed", "~> 0.12" +end + +gem "jekyll-assets", group: :jekyll_plugins +gem "jekyll-remote-theme", group: :jekyll_plugins + +# Windows and JRuby does not include zoneinfo files, so bundle the tzinfo-data gem +# and associated library. 
+platforms :mingw, :x64_mingw, :mswin, :jruby do + gem "tzinfo", "~> 1.2" + gem "tzinfo-data" +end + +# Performance-booster for watching directories on Windows +gem "wdm", "~> 0.1.1", :platforms => [:mingw, :x64_mingw, :mswin] + +gem "webrick", "~> 1.7" diff --git a/homepage/Gemfile.lock b/homepage/Gemfile.lock new file mode 100644 index 0000000..cd63d44 --- /dev/null +++ b/homepage/Gemfile.lock @@ -0,0 +1,120 @@ +GEM + remote: https://rubygems.org/ + specs: + addressable (2.8.0) + public_suffix (>= 2.0.2, < 5.0) + colorator (1.1.0) + concurrent-ruby (1.1.9) + em-websocket (0.5.2) + eventmachine (>= 0.12.9) + http_parser.rb (~> 0.6.0) + eventmachine (1.2.7) + fastimage (1.8.1) + addressable (~> 2.3, >= 2.3.5) + ffi (1.15.4) + forwardable-extended (2.6.0) + hike (1.2.3) + http_parser.rb (0.6.0) + i18n (1.8.10) + concurrent-ruby (~> 1.0) + jekyll (4.2.1) + addressable (~> 2.4) + colorator (~> 1.0) + em-websocket (~> 0.5) + i18n (~> 1.0) + jekyll-sass-converter (~> 2.0) + jekyll-watch (~> 2.0) + kramdown (~> 2.3) + kramdown-parser-gfm (~> 1.0) + liquid (~> 4.0) + mercenary (~> 0.4.0) + pathutil (~> 0.9) + rouge (~> 3.0) + safe_yaml (~> 1.0) + terminal-table (~> 2.0) + jekyll-assets (1.0.0) + fastimage (~> 1.6) + jekyll (>= 2) + mini_magick (~> 4.1) + sass (~> 3.2) + sprockets (~> 2.10) + sprockets-helpers + sprockets-sass + jekyll-feed (0.15.1) + jekyll (>= 3.7, < 5.0) + jekyll-remote-theme (0.4.3) + addressable (~> 2.0) + jekyll (>= 3.5, < 5.0) + jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0) + rubyzip (>= 1.3.0, < 3.0) + jekyll-sass-converter (2.1.0) + sassc (> 2.0.1, < 3.0) + jekyll-seo-tag (2.7.1) + jekyll (>= 3.8, < 5.0) + jekyll-watch (2.2.1) + listen (~> 3.0) + kramdown (2.3.1) + rexml + kramdown-parser-gfm (1.1.0) + kramdown (~> 2.0) + liquid (4.0.3) + listen (3.7.0) + rb-fsevent (~> 0.10, >= 0.10.3) + rb-inotify (~> 0.9, >= 0.9.10) + mercenary (0.4.0) + mini_magick (4.11.0) + minima (2.5.1) + jekyll (>= 3.5, < 5.0) + jekyll-feed (~> 0.9) + jekyll-seo-tag 
(~> 2.1) + multi_json (1.15.0) + pathutil (0.16.2) + forwardable-extended (~> 2.6) + public_suffix (4.0.6) + rack (1.6.13) + rb-fsevent (0.11.0) + rb-inotify (0.10.1) + ffi (~> 1.0) + rexml (3.2.5) + rouge (3.26.1) + rubyzip (2.3.2) + safe_yaml (1.0.5) + sass (3.7.4) + sass-listen (~> 4.0.0) + sass-listen (4.0.0) + rb-fsevent (~> 0.9, >= 0.9.4) + rb-inotify (~> 0.9, >= 0.9.7) + sassc (2.4.0) + ffi (~> 1.9) + sprockets (2.12.5) + hike (~> 1.2) + multi_json (~> 1.0) + rack (~> 1.0) + tilt (~> 1.1, != 1.3.0) + sprockets-helpers (1.4.0) + sprockets (>= 2.2) + sprockets-sass (1.3.1) + sprockets (~> 2.0) + tilt (~> 1.1) + terminal-table (2.0.0) + unicode-display_width (~> 1.1, >= 1.1.1) + tilt (1.4.1) + unicode-display_width (1.8.0) + webrick (1.7.0) + +PLATFORMS + x86_64-darwin-20 + +DEPENDENCIES + jekyll (~> 4.2.1) + jekyll-assets + jekyll-feed (~> 0.12) + jekyll-remote-theme + minima (~> 2.5) + tzinfo (~> 1.2) + tzinfo-data + wdm (~> 0.1.1) + webrick (~> 1.7) + +BUNDLED WITH + 2.2.28 diff --git a/homepage/_config.yml b/homepage/_config.yml new file mode 100644 index 0000000..d2be946 --- /dev/null +++ b/homepage/_config.yml @@ -0,0 +1,65 @@ +# Welcome to Jekyll! +# +# This config file is meant for settings that affect your whole blog, values +# which you are expected to set up once and rarely edit after that. If you find +# yourself editing this file very often, consider using Jekyll's data files +# feature for the data you need to update frequently. +# +# For technical reasons, this file is *NOT* reloaded automatically when you use +# 'bundle exec jekyll serve'. If you change this file, please restart the server process. +# +# If you need help with YAML syntax, here are some quick references for you: +# https://learn-the-web.algonquindesign.ca/topics/markdown-yaml-cheat-sheet/#yaml +# https://learnxinyminutes.com/docs/yaml/ +# +# Site settings +# These are used to personalize your new site. 
If you look in the HTML files, +# you will see them accessed via {{ site.title }}, {{ site.email }}, and so on. +# You can create any custom variable you would like, and they will be accessible +# in the templates via {{ site.myvariable }}. + +title: graphgrove +email: nmonath@cs.umass.edu +description: >- # this means to ignore newlines until "baseurl:" + A framework for building (and incrementally growing) graph-based data structures used in hierarchical or DAG-structured clustering and nearest neighbor search. +baseurl: "/graphgrove" # the subpath of your site, e.g. /blog +url: "" # the base hostname & protocol for your site, e.g. http://example.com +github_username: nmonath + +# Build settings +remote_theme: pages-themes/minimal@v0.2.0 +plugins: + - jekyll-feed + - jekyll-remote-theme + +# Exclude from processing. +# The following items will not be processed, by default. +# Any item listed under the `exclude:` key here will be automatically added to +# the internal "default list". +# +# Excluded items can be processed by explicitly listing the directories or +# their entries' file path in the `include:` list. +# +# exclude: +# - .sass-cache/ +# - .jekyll-cache/ +# - gemfiles/ +# - Gemfile +# - Gemfile.lock +# - node_modules/ +# - vendor/bundle/ +# - vendor/cache/ +# - vendor/gems/ +# - vendor/ruby/ + +assets: + compression: true + sources: + - _assets + +github: + is_project_page: true + repository_url: https://github.com/nmonath/graphgrove + repository_nwo: nmonath/graphgrove + owner_url: https://github.com/nmonath + owner_name: "@nmonath" diff --git a/homepage/_includes/README.md b/homepage/_includes/README.md new file mode 100644 index 0000000..e23add0 --- /dev/null +++ b/homepage/_includes/README.md @@ -0,0 +1,78 @@ +
+ +
+ +## Install + +Linux wheels available (python >=3.6) on [pypi](https://pypi.org/project/graphgrove/): + +``` +pip install graphgrove +``` + +Building from source: + +``` +conda create -n gg python=3.8 +conda activate gg +pip install numpy +make +``` + +To build your own wheel: + +``` +conda create -n gg python=3.8 +conda activate gg +pip install numpy +make +pip install build +python -m build --wheel +# which can be used as: +# pip install --force dist/graphgrove-0.0.1-cp37-cp37m-linux_x86_64.whl +``` + +## Examples + +Toy examples of [clustering](https://github.com/nmonath/graphgrove/blob/main/examples/clustering.py), [DAG-structured clustering](https://github.com/nmonath/graphgrove/blob/main/examples/dag_clustering.py), and [nearest neighbor search](https://github.com/nmonath/graphgrove/blob/main/examples/nearest_neighbor_search.py) are available. + +At a high level, incremental clustering can be done as: + +```Python +import graphgrove as gg +k = 5 +num_rounds = 50 +thresholds = np.geomspace(1.0, 0.001, num_rounds).astype(np.float32) +scc = gg.vec_scc.Cosine_SCC(k=k, num_rounds=num_rounds, thresholds=thresholds, index_name='cosine_sgtree', cores=cores, verbosity=0) +# data_batches - generator of numpy matrices mini-batch-size by dim +for batch in data_batches: + scc.partial_fit(batch) +``` + +Incremental nearest neighbor search can be done as: +```Python +import graphgrove as gg +k=5 +cores=4 +tree = gg.graph_builder.Cosine_SGTree(k=k, cores=cores) +# data_batches - generator of numpy matrices mini-batch-size by dim +for batch in data_batches: + tree.insert(batch) # or tree.insert_and_knn(batch) +``` + +## Algorithms Implemented + +Clustering: +* Sub-Cluster Component Algorithm (SCC) and its minibatch variant from the paper: [Scalable Hierarchical Agglomerative Clustering](https://dl.acm.org/doi/10.1145/3447548.3467404). 
Nicholas Monath, Kumar Avinava Dubey, Guru Guruganesh, Manzil Zaheer, Amr Ahmed, Andrew McCallum, Gokhan Mergen, Marc Najork, Mert Terzihan, Bryon Tjanaka, Yuan Wang, Yuchen Wu. KDD. 2021. +* DAG Structured clustering (LLama) from [DAG-Structured Clustering by Nearest Neighbors](https://proceedings.mlr.press/v130/monath21a). Nicholas Monath, Manzil Zaheer, Kumar Avinava Dubey, Amr Ahmed, Andrew McCallum. AISTATS 2021. + + +Nearest Neighbor Search: +* CoverTree: Alina Beygelzimer, Sham Kakade, and John Langford. "Cover trees for nearest neighbor." ICML. 2006. +* SGTree: SG-Tree is a new data structure for exact nearest neighbor search inspired from Cover Tree and its improvement, which has been used in the TerraPattern project. At a high level, SG-Tree tries to create a hierarchical tree where each node performs a "coarse" clustering. The centers of these "clusters" become the children and subsequent insertions are recursively performed on these children. When performing the NN query, we prune out solutions based on a subset of the dimensions that are being queried. This is particularly useful when trying to find the nearest neighbor in a highly clustered subset of the data, e.g. when the data comes from a recursive mixture of Gaussians or more generally a time marginalized coalescent process. The effect of these two optimizations is that our data structure is extremely simple, highly parallelizable and is comparable in performance to existing NN implementations on many data-sets. Manzil Zaheer, Guru Guruganesh, Golan Levin, Alexander Smola. [TerraPattern: A Nearest Neighbor Search Service](http://manzil.ml/res/Papers/2019_sgtree.pdf). 2019. 
+ +## Credits + +Special thanks to the following contributors: + +- Andrew Drozdov ([@mrdrozdov](https://github.com/mrdrozdov)) diff --git a/homepage/_layouts/default.html b/homepage/_layouts/default.html new file mode 100644 index 0000000..79d863f --- /dev/null +++ b/homepage/_layouts/default.html @@ -0,0 +1,58 @@ + + + + + + + +{% seo %} + + + {% include head-custom.html %} + + +
+
+

{{ site.title | default: site.github.repository_name }}

+ + {% if site.logo %} + Logo + {% endif %} + +

{{ site.description | default: site.github.project_tagline }}

+ + {% if site.github.is_project_page %} +

View the Project on GitHub {{ site.github.repository_nwo }}

+ {% endif %} + + {% if site.github.is_user_page %} +

View My GitHub Profile

+ {% endif %} + + {% if site.show_downloads %} + + {% endif %} + +

View Additional Documentation

+
+
+ + {{ content }} + +
+ +
+ + + \ No newline at end of file diff --git a/homepage/index.markdown b/homepage/index.markdown new file mode 100644 index 0000000..ee6a365 --- /dev/null +++ b/homepage/index.markdown @@ -0,0 +1,5 @@ +--- +layout: default +--- + +{% include README.md %}