From 6a8a7ae05e4144e1f679b868476906ee4e23ed98 Mon Sep 17 00:00:00 2001
From: Michael Demoret
Date: Wed, 8 Jul 2020 19:59:52 -0600
Subject: [PATCH 01/15] Source links working for python code

---
 docs/source/_static/example_mod.js   |  52 ++++++++++++++
 docs/source/conf.py                  |  33 +++++++--
 docs/source/sphinxext/github_link.py | 101 +++++++++++++++++++++++++++
 3 files changed, 182 insertions(+), 4 deletions(-)
 create mode 100644 docs/source/_static/example_mod.js
 create mode 100644 docs/source/sphinxext/github_link.py

diff --git a/docs/source/_static/example_mod.js b/docs/source/_static/example_mod.js
new file mode 100644
index 0000000000..c8f620048d
--- /dev/null
+++ b/docs/source/_static/example_mod.js
@@ -0,0 +1,52 @@
+$(document).ready(function () {
+    /* Add a [>>>] button on the top-right corner of code samples to hide
+     * the >>> and ... prompts and the output and thus make the code
+     * copyable. */
+    var div = $('.highlight-python .highlight,' +
+        '.highlight-python3 .highlight,' +
+        '.highlight-pycon .highlight,' +
+        '.highlight-default .highlight')
+    var pre = div.find('pre');
+
+    // get the styles from the current theme
+    pre.parent().parent().css('position', 'relative');
+    var hide_text = 'Hide prompts and outputs';
+    var show_text = 'Show prompts and outputs';
+
+    // create and add the button to all the code blocks that contain >>>
+    div.each(function (index) {
+        var jthis = $(this);
+        if (jthis.find('.gp').length > 0) {
+            var button = $('<span class="copybutton">&gt;&gt;&gt;</span>');
+            button.attr('title', hide_text);
+            button.data('hidden', 'false');
+            jthis.prepend(button);
+        }
+        // tracebacks (.gt) contain bare text elements that need to be
+        // wrapped in a span to work with .nextUntil() (see later)
+        jthis.find('pre:has(.gt)').contents().filter(function () {
+            return ((this.nodeType == 3) && (this.data.trim().length > 0));
+        }).wrap('<span>');
+    });
+
+    // define the behavior of the button when it's clicked
+    $('.copybutton').click(function (e) {
+        e.preventDefault();
+        var button = $(this);
+        if (button.data('hidden') === 'false') {
+            // hide the code output
+            button.parent().find('.go, .gp, .gt').hide();
+            button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'hidden');
+            button.css('text-decoration', 'line-through');
+            button.attr('title', show_text);
+            button.data('hidden', 'true');
+        } else {
+            // show the code output
+            button.parent().find('.go, .gp, .gt').show();
+            button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'visible');
+            button.css('text-decoration', 'none');
+            button.attr('title', hide_text);
+            button.data('hidden', 'false');
+        }
+    });
+});
\ No newline at end of file
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 6dc84377bb..dc32b87c88 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -20,6 +20,14 @@
 import sys
 sys.path.insert(0, os.path.abspath('../../python'))
 
+# If extensions (or modules to document with autodoc) are in another
+# directory, add these directories to sys.path here. If the directory
+# is relative to the documentation root, use os.path.abspath to make it
+# absolute, like shown here.
+sys.path.insert(0, os.path.abspath('./sphinxext'))
+
+from github_link import make_linkcode_resolve
+
 # -- General configuration ------------------------------------------------
 
 # If your documentation needs a minimal Sphinx version, state it here.
@@ -30,15 +38,17 @@
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
extensions = [ - 'sphinx.ext.intersphinx', + 'numpydoc', 'sphinx.ext.autodoc', 'sphinx.ext.autosummary', - 'numpydoc', - "sphinx_markdown_tables", + 'sphinx.ext.doctest', + 'sphinx.ext.intersphinx', + 'sphinx.ext.linkcode', "IPython.sphinxext.ipython_console_highlighting", "IPython.sphinxext.ipython_directive", "nbsphinx", "recommonmark", + "sphinx_markdown_tables", ] ipython_mplbackend = "str" @@ -46,11 +56,14 @@ # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] +# generate autosummary even if no references +# autosummary_generate = True + # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] -source_suffix = {".rst": "restructuredtext", ".md": "markdown"} +source_suffix = {".rst": "restructuredtext", ".md": "markdown"} # The master toctree document. master_doc = 'index' @@ -120,6 +133,10 @@ html_static_path = ['_static'] +html_js_files = [ + "example_mod.js" +] + # -- Options for HTMLHelp output ------------------------------------------ # Output file base name for HTML help builder. @@ -188,3 +205,11 @@ def setup(app): app.add_css_file('params.css') + app.add_css_file('copybutton.css') + +# The following is used by sphinx.ext.linkcode to provide links to github +linkcode_resolve = make_linkcode_resolve('cuml', + 'https://github.com/rapidsai/' + 'cuml/blob/{revision}/python/' + '{package}/{path}#L{lineno}') + diff --git a/docs/source/sphinxext/github_link.py b/docs/source/sphinxext/github_link.py new file mode 100644 index 0000000000..98eed65ae3 --- /dev/null +++ b/docs/source/sphinxext/github_link.py @@ -0,0 +1,101 @@ +from operator import attrgetter +import inspect +import subprocess +import os +import sys +from functools import partial + +# orig = inspect.isfunction + +# # See https://opendreamkit.org/2017/06/09/CythonSphinx/ +# def isfunction(obj): + +# orig_val = orig(obj) + +# new_val = hasattr(type(obj), "__code__") + +# if (orig_val != new_val): +# pass + +# return orig_val + +# inspect.isfunction = isfunction + +REVISION_CMD = 'git rev-parse --short HEAD' + + +def _get_git_revision(): + try: + revision = subprocess.check_output(REVISION_CMD.split()).strip() + except (subprocess.CalledProcessError, OSError): + print('Failed to execute git to get revision') + return None + return revision.decode('utf-8') + + +def _linkcode_resolve(domain, info, package, url_fmt, revision): + """Determine a link to online source for a class/method/function + + This is called by sphinx.ext.linkcode + + An example with a long-untouched module that everyone has + >>> _linkcode_resolve('py', {'module': 'tty', + ... 'fullname': 'setraw'}, + ... package='tty', + ... url_fmt='http://hg.python.org/cpython/file/' + ... '{revision}/Lib/{package}/{path}#L{lineno}', + ... 
revision='xxxx')
+    'http://hg.python.org/cpython/file/xxxx/Lib/tty/tty.py#L18'
+    """
+
+    if revision is None:
+        return
+    if domain not in ('py', 'pyx'):
+        return
+    if not info.get('module') or not info.get('fullname'):
+        return
+
+    class_name = info['fullname'].split('.')[0]
+    module = __import__(info['module'], fromlist=[class_name])
+    obj = attrgetter(info['fullname'])(module)
+
+    # Unwrap the object to get the correct source
+    # file in case that is wrapped by a decorator
+    obj = inspect.unwrap(obj)
+
+    try:
+        fn = inspect.getsourcefile(obj)
+    except Exception:
+        fn = None
+    if not fn:
+        try:
+            fn = inspect.getsourcefile(sys.modules[obj.__module__])
+        except Exception:
+            fn = None
+    if not fn:
+        return
+
+    fn = os.path.relpath(fn,
+                         start=os.path.dirname(__import__(package).__file__))
+    try:
+        lineno = inspect.getsourcelines(obj)[1]
+    except Exception:
+        lineno = ''
+    return url_fmt.format(revision=revision, package=package,
+                          path=fn, lineno=lineno)
+
+
+def make_linkcode_resolve(package, url_fmt):
+    """Returns a linkcode_resolve function for the given URL format
+
+    revision is a git commit reference (hash or name)
+
+    package is the name of the root module of the package
+
+    url_fmt is along the lines of ('https://github.com/USER/PROJECT/'
+                                   'blob/{revision}/{package}/'
+                                   '{path}#L{lineno}')
+    """
+    revision = _get_git_revision()
+    return partial(_linkcode_resolve, revision=revision, package=package,
+                   url_fmt=url_fmt)
\ No newline at end of file
From aab10286a883ac6f28cc86939345dc584449add7 Mon Sep 17 00:00:00 2001
From: Michael Demoret
Date: Wed, 8 Jul 2020 20:01:08 -0600
Subject: [PATCH 02/15] Missed copybutton.css

---
 docs/source/_static/copybutton.css | 38 ++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 docs/source/_static/copybutton.css

diff --git a/docs/source/_static/copybutton.css b/docs/source/_static/copybutton.css
new file mode 100644
index 0000000000..31578bc0a8
--- /dev/null
+++ b/docs/source/_static/copybutton.css
@@ -0,0 +1,38 @@
+/* copybutton */
+/* Adds "Show/Hide Output" button to Examples */
+
+.copybutton {
+    cursor: pointer;
+    position: absolute;
+    top: 0px;
+    right: 0px;
+    border: 1px solid rgb(221, 221, 221);
+    color: rgb(221, 221, 221);
+    font-family: monospace;
+    padding-left: 0.2rem;
+    padding-right: 0.2rem;
+}
+
+div.highlight:hover span.copybutton::after {
+    background: #3F556B;
+    border-radius: 0.25rem;
+    color: white;
+    content: attr(title);
+    padding: 0.25rem;
+    position: absolute;
+    z-index: 98;
+    width: 100px;
+    font-size: 0.7rem;
+    top: 0;
+    right: 0;
+}
+
+/* copy button */
+div.highlight:hover span.copybutton {
+    background-color: #3F556B;
+    color: white;
+}
+
+div.highlight:hover span.copybutton:hover {
+    background-color: #20252B;
+}
\ No newline at end of file
From 4f12ea682e89822d400ad76b18dc489176e03110 Mon Sep 17 00:00:00 2001
From: Michael Demoret
Date: Thu, 9 Jul 2020 17:45:22 -0600
Subject: [PATCH 03/15] Bad commit.
Do not use --- docs/source/sphinxext/github_link.py | 56 ++++-- python/cuml_build_ext.py | 111 +++++++++++ python/setup copy.py | 228 ++++++++++++++++++++++ python/setup_new.py | 272 +++++++++++++++++++++++++++ 4 files changed, 650 insertions(+), 17 deletions(-) create mode 100644 python/cuml_build_ext.py create mode 100644 python/setup copy.py create mode 100644 python/setup_new.py diff --git a/docs/source/sphinxext/github_link.py b/docs/source/sphinxext/github_link.py index 98eed65ae3..2221d46be2 100644 --- a/docs/source/sphinxext/github_link.py +++ b/docs/source/sphinxext/github_link.py @@ -4,25 +4,28 @@ import os import sys from functools import partial +import re +import typing -# orig = inspect.isfunction +orig = inspect.isfunction -# # See https://opendreamkit.org/2017/06/09/CythonSphinx/ -# def isfunction(obj): +# See https://opendreamkit.org/2017/06/09/CythonSphinx/ +def isfunction(obj): -# orig_val = orig(obj) + orig_val = orig(obj) -# new_val = hasattr(type(obj), "__code__") + new_val = hasattr(type(obj), "__code__") -# if (orig_val != new_val): -# pass + if (orig_val != new_val): + return new_val -# return orig_val + return orig_val -# inspect.isfunction = isfunction +inspect.isfunction = isfunction REVISION_CMD = 'git rev-parse --short HEAD' +source_regex = re.compile(r"^File: (.*?) \(starting at line ([0-9]*?)\)$", re.MULTILINE) def _get_git_revision(): try: @@ -63,6 +66,9 @@ def _linkcode_resolve(domain, info, package, url_fmt, revision): # file in case that is wrapped by a decorator obj = inspect.unwrap(obj) + fn: str = None + lineno: str = None + try: fn = inspect.getsourcefile(obj) except Exception: @@ -72,15 +78,31 @@ def _linkcode_resolve(domain, info, package, url_fmt, revision): fn = inspect.getsourcefile(sys.modules[obj.__module__]) except Exception: fn = None - if not fn: - return - fn = os.path.relpath(fn, - start=os.path.dirname(__import__(package).__file__)) - try: - lineno = inspect.getsourcelines(obj)[1] - except Exception: - lineno = '' + if not fn: + # Possibly Cython code. Search docstring for source + m = source_regex.search(obj.__doc__) + + if (m is not None): + source_file = m.group(1) + lineno = m.group(2) + + # fn is expected to be the absolute path. + fn = os.path.relpath(source_file, start=package) + print("{}:{}".format(os.path.abspath(os.path.join("..", "python", "cuml", fn)), lineno)) + else: + return + else: + # Convert to relative from module root + fn = os.path.relpath(fn, + start=os.path.dirname(__import__(package).__file__)) + + # Get the line number if we need it. (Can work without it) + if (lineno is None): + try: + lineno = inspect.getsourcelines(obj)[1] + except Exception: + lineno = '' return url_fmt.format(revision=revision, package=package, path=fn, lineno=lineno) diff --git a/python/cuml_build_ext.py b/python/cuml_build_ext.py new file mode 100644 index 0000000000..25e3a3a608 --- /dev/null +++ b/python/cuml_build_ext.py @@ -0,0 +1,111 @@ +import sys + +if 'setuptools' in sys.modules: + try: + from Cython.Distutils.build_ext import new_build_ext as _build_ext + except ImportError: + from setuptools.command.build_ext import build_ext as _build_ext +else: + from distutils.command.build_ext import build_ext as _build_ext + + +import setuptools.command.build_ext + +class new_build_ext(_build_ext, object): + user_options = [ + ( + 'language-level=', + None, + 'Sets the python language syntax to use "2", "3", "3str".' 
+ ), + ( + "binding=", + None, + "Sets the binding Cython binding directive" + ), + ( + "profile=", + None, + "Sets the profile Cython binding directive" + ), + ( + "embedsignature=", + None, + "Sets the binding Cython binding directive" + ), + ( + "cython-exclude=", + None, + "Sets the binding Cython binding directive" + ) + ] + _build_ext.user_options + + boolean_options = [ + "binding", "profile", "embedsignature" + ] + _build_ext.boolean_options + + def initialize_options(self): + + print("cuml_build_ext::initialize_options") + + self.language_level = None + self.binding = None + self.profile = None + self.embedsignature = None + self.cython_exclude = None + super(new_build_ext, self).initialize_options() + + def finalize_options(self): + + print("cuml_build_ext::finalize_options") + + self.set_undefined_options('build', + ('build_lib', 'build_lib'), + ('build_temp', 'build_temp'), + ('compiler', 'compiler'), + ('debug', 'debug'), + ('force', 'force'), + ('parallel', 'parallel'), + ('plat_name', 'plat_name'), + ) + + if self.distribution.ext_modules: + if self.language_level is None: + self.language_level = str(sys.version_info[0]) + + assert self.language_level in ('2', '3', '3str'), 'Incorrect Cython language level ("{0}")'.format(self.language_level) + + compiler_directives = dict(language_level=self.language_level) + + if (self.binding is not None): + compiler_directives.update({ "binding": bool(self.binding) }) + + if (self.profile is not None): + compiler_directives.update({ "profile": bool(self.profile) }) + + if (self.embedsignature is not None): + compiler_directives.update({ "embedsignature": bool(self.embedsignature) }) + + cythonize_kwargs = { + } + + if (self.cython_exclude is not None): + + if (type(self.cython_exclude) == str): + self.cython_exclude = list(self.cython_exclude) + + cythonize_kwargs.update({ "exclude": self.cython_exclude }) + + nthreads = getattr(self, 'parallel', None) # -j option in Py3.5+ + nthreads = int(nthreads) if nthreads else None + + from Cython.Build.Dependencies import cythonize + + self.distribution.ext_modules[:] = cythonize( + self.distribution.ext_modules, + nthreads=nthreads, + force=self.force, + compiler_directives=compiler_directives, + **cythonize_kwargs + ) + setuptools.command.build_ext.build_ext.finalize_options(self) diff --git a/python/setup copy.py b/python/setup copy.py new file mode 100644 index 0000000000..e72d43d875 --- /dev/null +++ b/python/setup copy.py @@ -0,0 +1,228 @@ +# +# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from distutils.sysconfig import get_python_lib +from pathlib import Path +from pprint import pprint +from setuptools import find_packages +from setuptools import setup +from setuptools.extension import Extension +from setuputils import clean_folder +from setuputils import get_environment_option +from setuputils import get_cli_option +from setuputils import use_raft_package + +import glob +import numpy +import os +import shutil +import sys +import sysconfig +import versioneer +import warnings + + +if "--singlegpu" in sys.argv: + from Cython.Build import cythonize + from setuptools.command.build_ext import build_ext +else: + try: + from Cython.Distutils.build_ext import new_build_ext as build_ext + except ImportError: + from setuptools.command.build_ext import build_ext + +install_requires = [ + 'numba', + 'cython' +] + +############################################################################## +# - Print of build options used by setup.py -------------------------------- + +cuda_home = get_environment_option("CUDA_HOME") +libcuml_path = get_environment_option('CUML_BUILD_PATH') +raft_path = get_environment_option('RAFT_PATH') + +clean_artifacts = get_cli_option('clean') +single_gpu_build = get_cli_option('--singlegpu') + +############################################################################## +# - Dependencies include and lib folder setup -------------------------------- + +if not cuda_home: + nvcc_path = shutil.which('nvcc') + if (not nvcc_path): + raise FileNotFoundError("nvcc not found.") + + cuda_home = str(Path(nvcc_path).parent.parent) + print("-- Using nvcc to detect CUDA, found at " + str(cuda_home)) +cuda_include_dir = os.path.join(cuda_home, "include") +cuda_lib_dir = os.path.join(cuda_home, "lib64") + +############################################################################## +# - Clean target ------------------------------------------------------------- + +if clean_artifacts: + print("-- Cleaning all Python and Cython build artifacts...") + + treelite_path = "" + libcuml_path = "" + + try: + setup_file_path = str(Path(__file__).parent.absolute()) + shutil.rmtree(setup_file_path + '/.pytest_cache', ignore_errors=True) + shutil.rmtree(setup_file_path + '/_external_repositories', + ignore_errors=True) + shutil.rmtree(setup_file_path + '/cuml.egg-info', ignore_errors=True) + shutil.rmtree(setup_file_path + '/__pycache__', ignore_errors=True) + + os.remove(setup_file_path + '/cuml/raft') + + clean_folder(setup_file_path + '/cuml') + shutil.rmtree(setup_file_path + '/build') + + except IOError: + pass + + # need to terminate script so cythonizing doesn't get triggered after + # cleanup unintendedly + sys.argv.remove("clean") + + if "--all" in sys.argv: + sys.argv.remove("--all") + + if len(sys.argv) == 1: + sys.exit(0) + +############################################################################## +# - Cloning RAFT and dependencies if needed ---------------------------------- + +# Use RAFT repository in cuml.raft + +raft_include_dir = use_raft_package(raft_path, libcuml_path) + +############################################################################## +# - Cython extensions build and parameters ----------------------------------- + +# cumlcomms and nccl are still needed for multigpu algos not based +# on libcumlprims +libs = ['cuda', + 'cuml++', + 'rmm'] + +include_dirs = ['../cpp/src', + '../cpp/include', + '../cpp/src_prims', + raft_include_dir, + '../cpp/comms/std/src', + '../cpp/comms/std/include', + cuda_include_dir, + numpy.get_include(), + 
os.path.dirname(sysconfig.get_path("include"))] + +# Exclude multigpu components that use libcumlprims if --singlegpu is used +cython_exc_list = [] +python_exc_list = [] + +if "--multigpu" in sys.argv: + warnings.warn("Flag --multigpu is deprecated. By default cuML is" + "built with multi GPU support. To disable it use the flag" + "--singlegpu") + sys.argv.remove('--multigpu') + +if "--singlegpu" in sys.argv: + cython_exc_list = glob.glob('cuml/*/*_mg.pyx') + cython_exc_list = cython_exc_list + glob.glob('cuml/*/*_mg.pxd') + cython_exc_list.append('cuml/nccl/nccl.pyx') + cython_exc_list.append('cuml/dask/common/comms_utils.pyx') + + print('--singlegpu: excluding the following Cython components:') + pprint(cython_exc_list) + + python_exc_list = ["*.dask", "*.dask.*"] +else: + libs.append('cumlprims') + libs.append('cumlcomms') + libs.append('nccl') + + sys_include = os.path.dirname(sysconfig.get_path("include")) + include_dirs.append("%s/cumlprims" % sys_include) + +cmdclass = dict() +cmdclass.update(versioneer.get_cmdclass()) +cmdclass["build_ext"] = build_ext + +if not libcuml_path: + libcuml_path = '../cpp/build/' + +# from Cython.Compiler import Options + +# Options.embed_pos_in_docstring = True + +extensions = [ + Extension("*", + sources=["cuml/**/**/*.pyx"], + include_dirs=include_dirs, + library_dirs=[get_python_lib(), libcuml_path], + runtime_library_dirs=[cuda_lib_dir, + os.path.join(os.sys.prefix, "lib")], + libraries=libs, + language='c++', + extra_compile_args=['-std=c++11'], + define_macros=[("CYTHON_TRACE=1",)]) +] + +for e in extensions: + # TODO: this exclude is not working, need to research way to properly + # exclude files for parallel build. See issue + # https://github.com/rapidsai/cuml/issues/2037 + # e.exclude = cython_exc_list + e.cython_directives = dict( + profile=False, language_level=3, embedsignature=True, linetrace=True, binding=True + ) + +if "--singlegpu" in sys.argv: + print("Full cythonization in parallel is not supported for singlegpu " + + "target for now.") + directives = dict( + profile=False, language_level=3, embedsignature=True, linetrace=True, binding=True + ) + extensions = cythonize(extensions, + exclude=cython_exc_list, nthreads=6, compiler_directives=directives) + sys.argv.remove('--singlegpu') + +############################################################################## +# - Python package generation ------------------------------------------------ + +setup(name='cuml', + description="cuML - RAPIDS ML Algorithms", + version=versioneer.get_version(), + classifiers=[ + "Intended Audience :: Developers", + "Programming Language :: Python", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7" + ], + author="NVIDIA Corporation", + setup_requires=['cython'], + ext_modules=extensions, + packages=find_packages(include=['cuml', 'cuml.*'], + exclude=python_exc_list), + install_requires=install_requires, + license="Apache", + cmdclass=cmdclass, + zip_safe=False + ) diff --git a/python/setup_new.py b/python/setup_new.py new file mode 100644 index 0000000000..f37fc95e75 --- /dev/null +++ b/python/setup_new.py @@ -0,0 +1,272 @@ +# +# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from distutils.sysconfig import get_python_lib +from pathlib import Path +from pprint import pprint +from setuptools import find_packages +from setuptools import setup +from setuptools.extension import Extension +from setuputils import clean_folder +from setuputils import get_environment_option +from setuputils import get_cli_option +from setuputils import use_raft_package +from distutils.command.build import build as _build +# from cuml_build_ext import new_build_ext + +from Cython.Distutils.build_ext import new_build_ext + +import glob +import numpy +import os +import shutil +import sys +import sysconfig +import versioneer +import warnings + +install_requires = [ + 'numba', + 'cython' +] + +############################################################################## +# - Print of build options used by setup.py -------------------------------- + +cuda_home = get_environment_option("CUDA_HOME") +libcuml_path = get_environment_option('CUML_BUILD_PATH') +raft_path = get_environment_option('RAFT_PATH') + +clean_artifacts = get_cli_option('clean') +single_gpu_build = get_cli_option('--singlegpu') + +############################################################################## +# - Dependencies include and lib folder setup -------------------------------- + +if not cuda_home: + nvcc_path = shutil.which('nvcc') + if (not nvcc_path): + raise FileNotFoundError("nvcc not found.") + + cuda_home = str(Path(nvcc_path).parent.parent) + print("-- Using nvcc to detect CUDA, found at " + str(cuda_home)) +cuda_include_dir = os.path.join(cuda_home, "include") +cuda_lib_dir = os.path.join(cuda_home, "lib64") + +############################################################################## +# - Clean target ------------------------------------------------------------- + +if clean_artifacts: + print("-- Cleaning all Python and Cython build artifacts...") + + treelite_path = "" + libcuml_path = "" + + try: + setup_file_path = str(Path(__file__).parent.absolute()) + shutil.rmtree(setup_file_path + '/.pytest_cache', ignore_errors=True) + shutil.rmtree(setup_file_path + '/_external_repositories', + ignore_errors=True) + shutil.rmtree(setup_file_path + '/cuml.egg-info', ignore_errors=True) + shutil.rmtree(setup_file_path + '/__pycache__', ignore_errors=True) + + os.remove(setup_file_path + '/cuml/raft') + + clean_folder(setup_file_path + '/cuml') + shutil.rmtree(setup_file_path + '/build') + + except IOError: + pass + + # need to terminate script so cythonizing doesn't get triggered after + # cleanup unintendedly + sys.argv.remove("clean") + + if "--all" in sys.argv: + sys.argv.remove("--all") + + if len(sys.argv) == 1: + sys.exit(0) + +############################################################################## +# - Cloning RAFT and dependencies if needed ---------------------------------- + +# Use RAFT repository in cuml.raft + +raft_include_dir = use_raft_package(raft_path, libcuml_path) + +############################################################################## +# - Cython extensions build and parameters ----------------------------------- + +# cumlcomms and nccl are still needed for 
multigpu algos not based +# on libcumlprims +libs = ['cuda', + 'cuml++', + 'rmm'] + +include_dirs = ['../cpp/src', + '../cpp/include', + '../cpp/src_prims', + raft_include_dir, + '../cpp/comms/std/src', + '../cpp/comms/std/include', + cuda_include_dir, + numpy.get_include(), + os.path.dirname(sysconfig.get_path("include"))] + +# Exclude multigpu components that use libcumlprims if --singlegpu is used +cython_exc_list = [] +python_exc_list = [] + +if "--multigpu" in sys.argv: + warnings.warn("Flag --multigpu is deprecated. By default cuML is" + "built with multi GPU support. To disable it use the flag" + "--singlegpu") + sys.argv.remove('--multigpu') + +if "--singlegpu" in sys.argv: + cython_exc_list = glob.glob('cuml/*/*_mg.pyx') + cython_exc_list = cython_exc_list + glob.glob('cuml/*/*_mg.pxd') + cython_exc_list.append('cuml/nccl/nccl.pyx') + cython_exc_list.append('cuml/dask/common/comms_utils.pyx') + + print('--singlegpu: excluding the following Cython components:') + pprint(cython_exc_list) + + python_exc_list = ["*.dask", "*.dask.*"] +else: + libs.append('cumlprims') + libs.append('cumlcomms') + libs.append('nccl') + + sys_include = os.path.dirname(sysconfig.get_path("include")) + include_dirs.append("%s/cumlprims" % sys_include) + +cmdclass = dict() +cmdclass.update(versioneer.get_cmdclass()) +cmdclass["build_ext"] = new_build_ext + +if not libcuml_path: + libcuml_path = '../cpp/build/' + +# extensions = [ +# Extension("*", +# sources=["cuml/**/**/*.pyx"], +# include_dirs=include_dirs, +# library_dirs=[get_python_lib(), libcuml_path], +# runtime_library_dirs=[cuda_lib_dir, +# os.path.join(os.sys.prefix, "lib")], +# libraries=libs, +# language='c++', +# extra_compile_args=['-std=c++11']) +# ] + +# for e in extensions: +# # TODO: this exclude is not working, need to research way to properly +# # exclude files for parallel build. See issue +# # https://github.com/rapidsai/cuml/issues/2037 +# # e.exclude = cython_exc_list +# e.cython_directives = dict( +# profile=False, language_level=3, embedsignature=True, binding=True +# ) + +# if "--singlegpu" in sys.argv: +# print("Full cythonization in parallel is not supported for singlegpu " + +# "target for now.") +# extensions = cythonize(extensions, +# exclude=cython_exc_list) +# sys.argv.remove('--singlegpu') + +class cuml_build(_build): + + user_options = [ + ( + 'single-gpu=', + None, + 'Determines whether to compile for single gpu or not' + ) + ] + _build.user_options + + boolean_options = [ + "single-gpu" + ] + _build.boolean_options + + def initialize_options(self): + + print("cuml_build::initialize_options") + + self.single_gpu = False + + # import cuml_build_ext + + # self.distribution.cmdclass['build_ext'] = cuml_build_ext.new_build_ext + + # if you wanted to use the Extension class from Cython + # from Cython.Distutils.extension import Extension + # ext = Extension(....) 
+ # self.distribution.ext_modules = [ext] + + extensions = [ + Extension("*", + sources=["cuml/**/*.pyx"], + include_dirs=include_dirs, + library_dirs=[get_python_lib(), libcuml_path], + runtime_library_dirs=[cuda_lib_dir, + os.path.join(os.sys.prefix, "lib")], + libraries=libs, + language='c++', + extra_compile_args=['-std=c++11']) + ] + + self.distribution.ext_modules = extensions + + super().initialize_options() + + def finalize_options(self): + + print("cuml_build::finalize_options") + + if (self.single_gpu): + sub_build_ext = self.distribution.get_command_obj("build_ext") + + sub_build_ext.cython_exc_list = cython_exc_list + + super().finalize_options() + + +cmdclass["build"] = cuml_build + +############################################################################## +# - Python package generation ------------------------------------------------ + +setup(name='cuml', + description="cuML - RAPIDS ML Algorithms", + version=versioneer.get_version(), + classifiers=[ + "Intended Audience :: Developers", + "Programming Language :: Python", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7" + ], + author="NVIDIA Corporation", + setup_requires=['cython'], + packages=find_packages(include=['cuml', 'cuml.*'], + exclude=python_exc_list), + install_requires=install_requires, + license="Apache", + cmdclass=cmdclass, + zip_safe=False + ) From 6f8ee4c5301dd81de06f22a06c30d39445244ff3 Mon Sep 17 00:00:00 2001 From: Michael Demoret Date: Fri, 10 Jul 2020 18:28:16 -0600 Subject: [PATCH 04/15] Source links working for cython. Need to test cyfunction for benchmark performance regressions --- docs/source/sphinxext/github_link.py | 44 +++-- python/cuml_build_ext.py | 97 +++++----- python/setup.cfg | 6 + python/setup.py | 159 ++++++++++------ python/setup_new.py | 272 --------------------------- 5 files changed, 179 insertions(+), 399 deletions(-) delete mode 100644 python/setup_new.py diff --git a/docs/source/sphinxext/github_link.py b/docs/source/sphinxext/github_link.py index 2221d46be2..9fffee58d4 100644 --- a/docs/source/sphinxext/github_link.py +++ b/docs/source/sphinxext/github_link.py @@ -1,14 +1,14 @@ -from operator import attrgetter import inspect -import subprocess import os +import re +import subprocess import sys from functools import partial -import re -import typing +from operator import attrgetter orig = inspect.isfunction + # See https://opendreamkit.org/2017/06/09/CythonSphinx/ def isfunction(obj): @@ -21,11 +21,14 @@ def isfunction(obj): return orig_val + inspect.isfunction = isfunction REVISION_CMD = 'git rev-parse --short HEAD' -source_regex = re.compile(r"^File: (.*?) \(starting at line ([0-9]*?)\)$", re.MULTILINE) +source_regex = re.compile(r"^File: (.*?) \(starting at line ([0-9]*?)\)$", + re.MULTILINE) + def _get_git_revision(): try: @@ -89,22 +92,37 @@ def _linkcode_resolve(domain, info, package, url_fmt, revision): # fn is expected to be the absolute path. 
             fn = os.path.relpath(source_file, start=package)
-            print("{}:{}".format(os.path.abspath(os.path.join("..", "python", "cuml", fn)), lineno))
+            print("{}:{}".format(
+                os.path.abspath(os.path.join("..", "python", "cuml", fn)),
+                lineno))
         else:
             return
     else:
+        # Test if we are absolute or not (pyx are relative)
+        if (not os.path.isabs(fn)):
+            # Should be relative to docs right now
+            fn = os.path.abspath(os.path.join("..", "python", fn))
+
         # Convert to relative from module root
         fn = os.path.relpath(fn,
-                             start=os.path.dirname(__import__(package).__file__))
+                             start=os.path.dirname(
+                                 __import__(package).__file__))
 
     # Get the line number if we need it. (Can work without it)
     if (lineno is None):
         try:
             lineno = inspect.getsourcelines(obj)[1]
         except Exception:
-            lineno = ''
-    return url_fmt.format(revision=revision, package=package,
-                          path=fn, lineno=lineno)
+
+            # Can happen if it's a cyfunction. See if it has `__code__`
+            if (hasattr(obj, "__code__")):
+                lineno = obj.__code__.co_firstlineno
+            else:
+                lineno = ''
+    return url_fmt.format(revision=revision,
+                          package=package,
+                          path=fn,
+                          lineno=lineno)
 
 
 def make_linkcode_resolve(package, url_fmt):
@@ -119,5 +137,7 @@ def make_linkcode_resolve(package, url_fmt):
                                    '{path}#L{lineno}')
     """
     revision = _get_git_revision()
-    return partial(_linkcode_resolve, revision=revision, package=package,
-                   url_fmt=url_fmt)
\ No newline at end of file
+    return partial(_linkcode_resolve,
+                   revision=revision,
+                   package=package,
+                   url_fmt=url_fmt)
diff --git a/python/cuml_build_ext.py b/python/cuml_build_ext.py
index 25e3a3a608..ed05e8220c 100644
--- a/python/cuml_build_ext.py
+++ b/python/cuml_build_ext.py
@@ -8,41 +8,21 @@
 else:
     from distutils.command.build_ext import build_ext as _build_ext
 
-
 import setuptools.command.build_ext
 
+
 class new_build_ext(_build_ext, object):
     user_options = [
-        (
-            'language-level=',
-            None,
-            'Sets the python language syntax to use "2", "3", "3str".'
-        ),
-        (
-            "binding=",
-            None,
-            "Sets the binding Cython binding directive"
-        ),
-        (
-            "profile=",
-            None,
-            "Sets the profile Cython binding directive"
-        ),
-        (
-            "embedsignature=",
-            None,
-            "Sets the binding Cython binding directive"
-        ),
-        (
-            "cython-exclude=",
-            None,
-            "Sets the binding Cython binding directive"
-        )
+        ('language-level=', None,
+         'Sets the python language syntax to use "2", "3", "3str".'),
+        ("binding", None, "Sets the binding Cython compiler directive"),
+        ("profile", None, "Sets the profile Cython compiler directive"),
+        ("embedsignature", None,
+         "Sets the embedsignature Cython compiler directive"),
+        ("cython-exclude=", None,
+         "Sets the Cython files to exclude from cythonization")
     ] + _build_ext.user_options
 
-    boolean_options = [
-        "binding", "profile", "embedsignature"
-    ] + _build_ext.boolean_options
+    boolean_options = ["binding", "profile", "embedsignature"
+                       ] + _build_ext.boolean_options
 
     def initialize_options(self):
 
@@ -54,58 +34,65 @@ def initialize_options(self):
         self.embedsignature = None
         self.cython_exclude = None
         super(new_build_ext, self).initialize_options()
-    
+
     def finalize_options(self):
 
         print("cuml_build_ext::finalize_options")
 
-        self.set_undefined_options('build',
-                                   ('build_lib', 'build_lib'),
-                                   ('build_temp', 'build_temp'),
-                                   ('compiler', 'compiler'),
-                                   ('debug', 'debug'),
-                                   ('force', 'force'),
-                                   ('parallel', 'parallel'),
-                                   ('plat_name', 'plat_name'),
-                                   )
+        self.set_undefined_options(
+            'build',
+            ('build_lib', 'build_lib'),
+            ('build_temp', 'build_temp'),
+            ('compiler', 'compiler'),
+            ('debug', 'debug'),
+            ('force', 'force'),
+            ('parallel', 'parallel'),
+            ('plat_name', 'plat_name'),
+        )
 
         if self.distribution.ext_modules:
             if self.language_level is None:
                 self.language_level = str(sys.version_info[0])
 
-            assert self.language_level in ('2', '3', '3str'), 'Incorrect Cython language level ("{0}")'.format(self.language_level)
+            assert self.language_level in (
+                '2', '3',
+                '3str'), 'Incorrect Cython language level ("{0}")'.format(
+                    self.language_level)
 
             compiler_directives = dict(language_level=self.language_level)
 
             if (self.binding is not None):
-                compiler_directives.update({ "binding": bool(self.binding) })
+                self.binding = bool(self.binding)
+                compiler_directives.update({"binding": self.binding})
 
             if (self.profile is not None):
-                compiler_directives.update({ "profile": bool(self.profile) })
+                self.profile = bool(self.profile)
+                compiler_directives.update({"profile": self.profile})
 
             if (self.embedsignature is not None):
-                compiler_directives.update({ "embedsignature": bool(self.embedsignature) })
+                self.embedsignature = bool(self.embedsignature)
+                compiler_directives.update(
+                    {"embedsignature": self.embedsignature})
 
-            cythonize_kwargs = {
-            }
+            cythonize_kwargs = {}
 
             if (self.cython_exclude is not None):
 
-                if (type(self.cython_exclude) == str):
+                if (isinstance(self.cython_exclude, str)):
                     self.cython_exclude = list(self.cython_exclude)
 
-                cythonize_kwargs.update({ "exclude": self.cython_exclude })
-
+                cythonize_kwargs.update({"exclude": self.cython_exclude})
+
             nthreads = getattr(self, 'parallel', None)  # -j option in Py3.5+
             nthreads = int(nthreads) if nthreads else None
-
+
             from Cython.Build.Dependencies import cythonize
-
-            self.distribution.ext_modules[:] = cythonize(
-                self.distribution.ext_modules,
-                nthreads=nthreads,
+
+            self.distribution.ext_modules = cythonize(
+                self.distribution.ext_modules,
+                nthreads=nthreads,
                 force=self.force,
                 compiler_directives=compiler_directives,
-                **cythonize_kwargs
-            )
+                **cythonize_kwargs)
+
setuptools.command.build_ext.build_ext.finalize_options(self) diff --git a/python/setup.cfg b/python/setup.cfg index bc65780383..3a2def3551 100644 --- a/python/setup.cfg +++ b/python/setup.cfg @@ -13,3 +13,9 @@ versionfile_source = cuml/_version.py versionfile_build = cuml/_version.py tag_prefix = v parentdir_prefix = cuml- + +[build_ext] +binding = True +language_level = 3 +profile = False +embedsignature = True \ No newline at end of file diff --git a/python/setup.py b/python/setup.py index e462234b51..c6191d1f31 100644 --- a/python/setup.py +++ b/python/setup.py @@ -24,6 +24,10 @@ from setuputils import get_environment_option from setuputils import get_cli_option from setuputils import use_raft_package +from distutils.command.build import build as _build +from cuml_build_ext import new_build_ext + +# from Cython.Distutils.build_ext import new_build_ext import glob import numpy @@ -34,20 +38,7 @@ import versioneer import warnings - -if "--singlegpu" in sys.argv: - from Cython.Build import cythonize - from setuptools.command.build_ext import build_ext -else: - try: - from Cython.Distutils.build_ext import new_build_ext as build_ext - except ImportError: - from setuptools.command.build_ext import build_ext - -install_requires = [ - 'numba', - 'cython' -] +install_requires = ['numba', 'cython'] ############################################################################## # - Print of build options used by setup.py -------------------------------- @@ -119,19 +110,14 @@ # cumlcomms and nccl are still needed for multigpu algos not based # on libcumlprims -libs = ['cuda', - 'cuml++', - 'rmm'] - -include_dirs = ['../cpp/src', - '../cpp/include', - '../cpp/src_prims', - raft_include_dir, - '../cpp/comms/std/src', - '../cpp/comms/std/include', - cuda_include_dir, - numpy.get_include(), - os.path.dirname(sysconfig.get_path("include"))] +libs = ['cuda', 'cuml++', 'rmm'] + +include_dirs = [ + '../cpp/src', '../cpp/include', '../cpp/src_prims', raft_include_dir, + '../cpp/comms/std/src', '../cpp/comms/std/include', cuda_include_dir, + numpy.get_include(), + os.path.dirname(sysconfig.get_path("include")) +] # Exclude multigpu components that use libcumlprims if --singlegpu is used cython_exc_list = [] @@ -163,38 +149,94 @@ cmdclass = dict() cmdclass.update(versioneer.get_cmdclass()) -cmdclass["build_ext"] = build_ext +cmdclass["build_ext"] = new_build_ext if not libcuml_path: libcuml_path = '../cpp/build/' -extensions = [ - Extension("*", - sources=["cuml/**/**/*.pyx"], - include_dirs=include_dirs, - library_dirs=[get_python_lib(), libcuml_path], - runtime_library_dirs=[cuda_lib_dir, - os.path.join(os.sys.prefix, "lib")], - libraries=libs, - language='c++', - extra_compile_args=['-std=c++11']) -] +# extensions = [ +# Extension("*", +# sources=["cuml/**/**/*.pyx"], +# include_dirs=include_dirs, +# library_dirs=[get_python_lib(), libcuml_path], +# runtime_library_dirs=[cuda_lib_dir, +# os.path.join(os.sys.prefix, "lib")], +# libraries=libs, +# language='c++', +# extra_compile_args=['-std=c++11']) +# ] -for e in extensions: - # TODO: this exclude is not working, need to research way to properly - # exclude files for parallel build. See issue - # https://github.com/rapidsai/cuml/issues/2037 - # e.exclude = cython_exc_list - e.cython_directives = dict( - profile=False, language_level=3, embedsignature=True - ) +# for e in extensions: +# # TODO: this exclude is not working, need to research way to properly +# # exclude files for parallel build. 
See issue +# # https://github.com/rapidsai/cuml/issues/2037 +# # e.exclude = cython_exc_list +# e.cython_directives = dict( +# profile=False, language_level=3, embedsignature=True, binding=True +# ) -if "--singlegpu" in sys.argv: - print("Full cythonization in parallel is not supported for singlegpu " + - "target for now.") - extensions = cythonize(extensions, - exclude=cython_exc_list) - sys.argv.remove('--singlegpu') +# if "--singlegpu" in sys.argv: +# print("Full cythonization in parallel is not supported for singlegpu " + +# "target for now.") +# extensions = cythonize(extensions, +# exclude=cython_exc_list) +# sys.argv.remove('--singlegpu') + + +class cuml_build(_build): + + user_options = [('singlegpu=', None, + 'Determines whether to compile for single gpu or not') + ] + _build.user_options + + boolean_options = ["singlegpu"] + _build.boolean_options + + def initialize_options(self): + + print("cuml_build::initialize_options") + + self.singlegpu = False + + # import cuml_build_ext + + # self.distribution.cmdclass['build_ext'] = cuml_build_ext.new_build_ext + + # if you wanted to use the Extension class from Cython + # from Cython.Distutils.extension import Extension + # ext = Extension(....) + # self.distribution.ext_modules = [ext] + + extensions = [ + Extension("*", + sources=["cuml/**/*.pyx"], + include_dirs=include_dirs, + library_dirs=[get_python_lib(), libcuml_path], + runtime_library_dirs=[ + cuda_lib_dir, + os.path.join(os.sys.prefix, "lib") + ], + libraries=libs, + language='c++', + extra_compile_args=['-std=c++11']) + ] + + self.distribution.ext_modules = extensions + + super().initialize_options() + + def finalize_options(self): + + print("cuml_build::finalize_options") + + if (self.singlegpu): + sub_build_ext = self.distribution.get_command_obj("build_ext") + + sub_build_ext.cython_exc_list = cython_exc_list + + super().finalize_options() + + +cmdclass["build"] = cuml_build ############################################################################## # - Python package generation ------------------------------------------------ @@ -203,18 +245,15 @@ description="cuML - RAPIDS ML Algorithms", version=versioneer.get_version(), classifiers=[ - "Intended Audience :: Developers", - "Programming Language :: Python", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7" + "Intended Audience :: Developers", "Programming Language :: Python", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7" ], author="NVIDIA Corporation", setup_requires=['cython'], - ext_modules=extensions, packages=find_packages(include=['cuml', 'cuml.*'], exclude=python_exc_list), install_requires=install_requires, license="Apache", cmdclass=cmdclass, - zip_safe=False - ) + zip_safe=False) diff --git a/python/setup_new.py b/python/setup_new.py deleted file mode 100644 index f37fc95e75..0000000000 --- a/python/setup_new.py +++ /dev/null @@ -1,272 +0,0 @@ -# -# Copyright (c) 2018-2020, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -from distutils.sysconfig import get_python_lib -from pathlib import Path -from pprint import pprint -from setuptools import find_packages -from setuptools import setup -from setuptools.extension import Extension -from setuputils import clean_folder -from setuputils import get_environment_option -from setuputils import get_cli_option -from setuputils import use_raft_package -from distutils.command.build import build as _build -# from cuml_build_ext import new_build_ext - -from Cython.Distutils.build_ext import new_build_ext - -import glob -import numpy -import os -import shutil -import sys -import sysconfig -import versioneer -import warnings - -install_requires = [ - 'numba', - 'cython' -] - -############################################################################## -# - Print of build options used by setup.py -------------------------------- - -cuda_home = get_environment_option("CUDA_HOME") -libcuml_path = get_environment_option('CUML_BUILD_PATH') -raft_path = get_environment_option('RAFT_PATH') - -clean_artifacts = get_cli_option('clean') -single_gpu_build = get_cli_option('--singlegpu') - -############################################################################## -# - Dependencies include and lib folder setup -------------------------------- - -if not cuda_home: - nvcc_path = shutil.which('nvcc') - if (not nvcc_path): - raise FileNotFoundError("nvcc not found.") - - cuda_home = str(Path(nvcc_path).parent.parent) - print("-- Using nvcc to detect CUDA, found at " + str(cuda_home)) -cuda_include_dir = os.path.join(cuda_home, "include") -cuda_lib_dir = os.path.join(cuda_home, "lib64") - -############################################################################## -# - Clean target ------------------------------------------------------------- - -if clean_artifacts: - print("-- Cleaning all Python and Cython build artifacts...") - - treelite_path = "" - libcuml_path = "" - - try: - setup_file_path = str(Path(__file__).parent.absolute()) - shutil.rmtree(setup_file_path + '/.pytest_cache', ignore_errors=True) - shutil.rmtree(setup_file_path + '/_external_repositories', - ignore_errors=True) - shutil.rmtree(setup_file_path + '/cuml.egg-info', ignore_errors=True) - shutil.rmtree(setup_file_path + '/__pycache__', ignore_errors=True) - - os.remove(setup_file_path + '/cuml/raft') - - clean_folder(setup_file_path + '/cuml') - shutil.rmtree(setup_file_path + '/build') - - except IOError: - pass - - # need to terminate script so cythonizing doesn't get triggered after - # cleanup unintendedly - sys.argv.remove("clean") - - if "--all" in sys.argv: - sys.argv.remove("--all") - - if len(sys.argv) == 1: - sys.exit(0) - -############################################################################## -# - Cloning RAFT and dependencies if needed ---------------------------------- - -# Use RAFT repository in cuml.raft - -raft_include_dir = use_raft_package(raft_path, libcuml_path) - -############################################################################## -# - Cython extensions build and parameters ----------------------------------- - -# cumlcomms and nccl are still needed for multigpu algos not based -# on libcumlprims -libs = ['cuda', - 'cuml++', - 'rmm'] - -include_dirs = ['../cpp/src', - '../cpp/include', - '../cpp/src_prims', - raft_include_dir, - '../cpp/comms/std/src', - '../cpp/comms/std/include', - cuda_include_dir, - numpy.get_include(), - 
os.path.dirname(sysconfig.get_path("include"))] - -# Exclude multigpu components that use libcumlprims if --singlegpu is used -cython_exc_list = [] -python_exc_list = [] - -if "--multigpu" in sys.argv: - warnings.warn("Flag --multigpu is deprecated. By default cuML is" - "built with multi GPU support. To disable it use the flag" - "--singlegpu") - sys.argv.remove('--multigpu') - -if "--singlegpu" in sys.argv: - cython_exc_list = glob.glob('cuml/*/*_mg.pyx') - cython_exc_list = cython_exc_list + glob.glob('cuml/*/*_mg.pxd') - cython_exc_list.append('cuml/nccl/nccl.pyx') - cython_exc_list.append('cuml/dask/common/comms_utils.pyx') - - print('--singlegpu: excluding the following Cython components:') - pprint(cython_exc_list) - - python_exc_list = ["*.dask", "*.dask.*"] -else: - libs.append('cumlprims') - libs.append('cumlcomms') - libs.append('nccl') - - sys_include = os.path.dirname(sysconfig.get_path("include")) - include_dirs.append("%s/cumlprims" % sys_include) - -cmdclass = dict() -cmdclass.update(versioneer.get_cmdclass()) -cmdclass["build_ext"] = new_build_ext - -if not libcuml_path: - libcuml_path = '../cpp/build/' - -# extensions = [ -# Extension("*", -# sources=["cuml/**/**/*.pyx"], -# include_dirs=include_dirs, -# library_dirs=[get_python_lib(), libcuml_path], -# runtime_library_dirs=[cuda_lib_dir, -# os.path.join(os.sys.prefix, "lib")], -# libraries=libs, -# language='c++', -# extra_compile_args=['-std=c++11']) -# ] - -# for e in extensions: -# # TODO: this exclude is not working, need to research way to properly -# # exclude files for parallel build. See issue -# # https://github.com/rapidsai/cuml/issues/2037 -# # e.exclude = cython_exc_list -# e.cython_directives = dict( -# profile=False, language_level=3, embedsignature=True, binding=True -# ) - -# if "--singlegpu" in sys.argv: -# print("Full cythonization in parallel is not supported for singlegpu " + -# "target for now.") -# extensions = cythonize(extensions, -# exclude=cython_exc_list) -# sys.argv.remove('--singlegpu') - -class cuml_build(_build): - - user_options = [ - ( - 'single-gpu=', - None, - 'Determines whether to compile for single gpu or not' - ) - ] + _build.user_options - - boolean_options = [ - "single-gpu" - ] + _build.boolean_options - - def initialize_options(self): - - print("cuml_build::initialize_options") - - self.single_gpu = False - - # import cuml_build_ext - - # self.distribution.cmdclass['build_ext'] = cuml_build_ext.new_build_ext - - # if you wanted to use the Extension class from Cython - # from Cython.Distutils.extension import Extension - # ext = Extension(....) 
- # self.distribution.ext_modules = [ext] - - extensions = [ - Extension("*", - sources=["cuml/**/*.pyx"], - include_dirs=include_dirs, - library_dirs=[get_python_lib(), libcuml_path], - runtime_library_dirs=[cuda_lib_dir, - os.path.join(os.sys.prefix, "lib")], - libraries=libs, - language='c++', - extra_compile_args=['-std=c++11']) - ] - - self.distribution.ext_modules = extensions - - super().initialize_options() - - def finalize_options(self): - - print("cuml_build::finalize_options") - - if (self.single_gpu): - sub_build_ext = self.distribution.get_command_obj("build_ext") - - sub_build_ext.cython_exc_list = cython_exc_list - - super().finalize_options() - - -cmdclass["build"] = cuml_build - -############################################################################## -# - Python package generation ------------------------------------------------ - -setup(name='cuml', - description="cuML - RAPIDS ML Algorithms", - version=versioneer.get_version(), - classifiers=[ - "Intended Audience :: Developers", - "Programming Language :: Python", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7" - ], - author="NVIDIA Corporation", - setup_requires=['cython'], - packages=find_packages(include=['cuml', 'cuml.*'], - exclude=python_exc_list), - install_requires=install_requires, - license="Apache", - cmdclass=cmdclass, - zip_safe=False - ) From c938d60939147f48dde3c8a3d1a18076b097f1a3 Mon Sep 17 00:00:00 2001 From: Michael Demoret Date: Fri, 10 Jul 2020 18:34:17 -0600 Subject: [PATCH 05/15] Removing accidentally committed file --- python/setup copy.py | 228 ------------------------------------------- 1 file changed, 228 deletions(-) delete mode 100644 python/setup copy.py diff --git a/python/setup copy.py b/python/setup copy.py deleted file mode 100644 index e72d43d875..0000000000 --- a/python/setup copy.py +++ /dev/null @@ -1,228 +0,0 @@ -# -# Copyright (c) 2018-2020, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from distutils.sysconfig import get_python_lib -from pathlib import Path -from pprint import pprint -from setuptools import find_packages -from setuptools import setup -from setuptools.extension import Extension -from setuputils import clean_folder -from setuputils import get_environment_option -from setuputils import get_cli_option -from setuputils import use_raft_package - -import glob -import numpy -import os -import shutil -import sys -import sysconfig -import versioneer -import warnings - - -if "--singlegpu" in sys.argv: - from Cython.Build import cythonize - from setuptools.command.build_ext import build_ext -else: - try: - from Cython.Distutils.build_ext import new_build_ext as build_ext - except ImportError: - from setuptools.command.build_ext import build_ext - -install_requires = [ - 'numba', - 'cython' -] - -############################################################################## -# - Print of build options used by setup.py -------------------------------- - -cuda_home = get_environment_option("CUDA_HOME") -libcuml_path = get_environment_option('CUML_BUILD_PATH') -raft_path = get_environment_option('RAFT_PATH') - -clean_artifacts = get_cli_option('clean') -single_gpu_build = get_cli_option('--singlegpu') - -############################################################################## -# - Dependencies include and lib folder setup -------------------------------- - -if not cuda_home: - nvcc_path = shutil.which('nvcc') - if (not nvcc_path): - raise FileNotFoundError("nvcc not found.") - - cuda_home = str(Path(nvcc_path).parent.parent) - print("-- Using nvcc to detect CUDA, found at " + str(cuda_home)) -cuda_include_dir = os.path.join(cuda_home, "include") -cuda_lib_dir = os.path.join(cuda_home, "lib64") - -############################################################################## -# - Clean target ------------------------------------------------------------- - -if clean_artifacts: - print("-- Cleaning all Python and Cython build artifacts...") - - treelite_path = "" - libcuml_path = "" - - try: - setup_file_path = str(Path(__file__).parent.absolute()) - shutil.rmtree(setup_file_path + '/.pytest_cache', ignore_errors=True) - shutil.rmtree(setup_file_path + '/_external_repositories', - ignore_errors=True) - shutil.rmtree(setup_file_path + '/cuml.egg-info', ignore_errors=True) - shutil.rmtree(setup_file_path + '/__pycache__', ignore_errors=True) - - os.remove(setup_file_path + '/cuml/raft') - - clean_folder(setup_file_path + '/cuml') - shutil.rmtree(setup_file_path + '/build') - - except IOError: - pass - - # need to terminate script so cythonizing doesn't get triggered after - # cleanup unintendedly - sys.argv.remove("clean") - - if "--all" in sys.argv: - sys.argv.remove("--all") - - if len(sys.argv) == 1: - sys.exit(0) - -############################################################################## -# - Cloning RAFT and dependencies if needed ---------------------------------- - -# Use RAFT repository in cuml.raft - -raft_include_dir = use_raft_package(raft_path, libcuml_path) - -############################################################################## -# - Cython extensions build and parameters ----------------------------------- - -# cumlcomms and nccl are still needed for multigpu algos not based -# on libcumlprims -libs = ['cuda', - 'cuml++', - 'rmm'] - -include_dirs = ['../cpp/src', - '../cpp/include', - '../cpp/src_prims', - raft_include_dir, - '../cpp/comms/std/src', - '../cpp/comms/std/include', - cuda_include_dir, - numpy.get_include(), - 
os.path.dirname(sysconfig.get_path("include"))] - -# Exclude multigpu components that use libcumlprims if --singlegpu is used -cython_exc_list = [] -python_exc_list = [] - -if "--multigpu" in sys.argv: - warnings.warn("Flag --multigpu is deprecated. By default cuML is" - "built with multi GPU support. To disable it use the flag" - "--singlegpu") - sys.argv.remove('--multigpu') - -if "--singlegpu" in sys.argv: - cython_exc_list = glob.glob('cuml/*/*_mg.pyx') - cython_exc_list = cython_exc_list + glob.glob('cuml/*/*_mg.pxd') - cython_exc_list.append('cuml/nccl/nccl.pyx') - cython_exc_list.append('cuml/dask/common/comms_utils.pyx') - - print('--singlegpu: excluding the following Cython components:') - pprint(cython_exc_list) - - python_exc_list = ["*.dask", "*.dask.*"] -else: - libs.append('cumlprims') - libs.append('cumlcomms') - libs.append('nccl') - - sys_include = os.path.dirname(sysconfig.get_path("include")) - include_dirs.append("%s/cumlprims" % sys_include) - -cmdclass = dict() -cmdclass.update(versioneer.get_cmdclass()) -cmdclass["build_ext"] = build_ext - -if not libcuml_path: - libcuml_path = '../cpp/build/' - -# from Cython.Compiler import Options - -# Options.embed_pos_in_docstring = True - -extensions = [ - Extension("*", - sources=["cuml/**/**/*.pyx"], - include_dirs=include_dirs, - library_dirs=[get_python_lib(), libcuml_path], - runtime_library_dirs=[cuda_lib_dir, - os.path.join(os.sys.prefix, "lib")], - libraries=libs, - language='c++', - extra_compile_args=['-std=c++11'], - define_macros=[("CYTHON_TRACE=1",)]) -] - -for e in extensions: - # TODO: this exclude is not working, need to research way to properly - # exclude files for parallel build. See issue - # https://github.com/rapidsai/cuml/issues/2037 - # e.exclude = cython_exc_list - e.cython_directives = dict( - profile=False, language_level=3, embedsignature=True, linetrace=True, binding=True - ) - -if "--singlegpu" in sys.argv: - print("Full cythonization in parallel is not supported for singlegpu " + - "target for now.") - directives = dict( - profile=False, language_level=3, embedsignature=True, linetrace=True, binding=True - ) - extensions = cythonize(extensions, - exclude=cython_exc_list, nthreads=6, compiler_directives=directives) - sys.argv.remove('--singlegpu') - -############################################################################## -# - Python package generation ------------------------------------------------ - -setup(name='cuml', - description="cuML - RAPIDS ML Algorithms", - version=versioneer.get_version(), - classifiers=[ - "Intended Audience :: Developers", - "Programming Language :: Python", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7" - ], - author="NVIDIA Corporation", - setup_requires=['cython'], - ext_modules=extensions, - packages=find_packages(include=['cuml', 'cuml.*'], - exclude=python_exc_list), - install_requires=install_requires, - license="Apache", - cmdclass=cmdclass, - zip_safe=False - ) From 9721016a0c7e07a1db43e0bba9e0e9c34457b368 Mon Sep 17 00:00:00 2001 From: Michael Demoret Date: Fri, 10 Jul 2020 18:37:17 -0600 Subject: [PATCH 06/15] Adding PR to CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 61005671e1..429bef841a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ - PR #2394: Adding cosine & correlation distance for KNN - PR #2392: PCA can accept sparse inputs, and sparse prim for computing covariance - PR #2465: Support pandas 1.0+ +- PR #2541: Improve 
Documentation Examples and Source Linking ## Improvements - PR #2336: Eliminate `rmm.device_array` usage From acc33118b1566c74f6b066c5e9a7917d5abb52f3 Mon Sep 17 00:00:00 2001 From: Michael Demoret Date: Fri, 10 Jul 2020 19:13:55 -0600 Subject: [PATCH 07/15] Fixing style issues --- docs/source/conf.py | 47 ++++++++++++++------------------------------- python/setup.py | 2 -- 2 files changed, 14 insertions(+), 35 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index dc32b87c88..f0fa97ff96 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -18,15 +18,11 @@ # import os import sys -sys.path.insert(0, os.path.abspath('../../python')) +from sphinxext.github_link import make_linkcode_resolve + -# If extensions (or modules to document with autodoc) are in another -# directory, add these directories to sys.path here. If the directory -# is relative to the documentation root, use os.path.abspath to make it -# absolute, like shown here. -sys.path.insert(0, os.path.abspath('./sphinxext')) +sys.path.insert(0, os.path.abspath('../../python')) -from github_link import make_linkcode_resolve # -- General configuration ------------------------------------------------ @@ -63,7 +59,7 @@ # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] -source_suffix = {".rst": "restructuredtext", ".md": "markdown"} +source_suffix = {".rst": "restructuredtext", ".md": "markdown"} # The master toctree document. master_doc = 'index' @@ -100,7 +96,6 @@ # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False - # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for @@ -120,7 +115,6 @@ html_theme = 'sphinx_rtd_theme' html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] - # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. @@ -132,17 +126,13 @@ # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] - -html_js_files = [ - "example_mod.js" -] +html_js_files = ["example_mod.js"] # -- Options for HTMLHelp output ------------------------------------------ # Output file base name for HTML help builder. htmlhelp_basename = 'cuMLdoc' - # -- Options for LaTeX output --------------------------------------------- latex_elements = { @@ -167,20 +157,14 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'cuml.tex', 'cuml Documentation', - 'nvidia', 'manual'), + (master_doc, 'cuml.tex', 'cuml Documentation', 'nvidia', 'manual'), ] - # -- Options for manual page output --------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). 
-man_pages = [ - (master_doc, 'cuml', 'cuml Documentation', - [author], 1) -] - +man_pages = [(master_doc, 'cuml', 'cuml Documentation', [author], 1)] # -- Options for Texinfo output ------------------------------------------- @@ -188,16 +172,13 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'cuml', 'cuml Documentation', - author, 'cuml', 'One line description of project.', - 'Miscellaneous'), + (master_doc, 'cuml', 'cuml Documentation', author, 'cuml', + 'One line description of project.', 'Miscellaneous'), ] - # Example configuration for intersphinx: refer to the Python standard library. intersphinx_mapping = {'https://docs.python.org/': None} - # Config numpydoc numpydoc_show_inherited_class_members = False numpydoc_class_members_toctree = False @@ -207,9 +188,9 @@ def setup(app): app.add_css_file('params.css') app.add_css_file('copybutton.css') -# The following is used by sphinx.ext.linkcode to provide links to github -linkcode_resolve = make_linkcode_resolve('cuml', - 'https://github.com/rapidsai/' - 'cuml/blob/{revision}/python/' - '{package}/{path}#L{lineno}') +# The following is used by sphinx.ext.linkcode to provide links to github +linkcode_resolve = make_linkcode_resolve( + 'cuml', 'https://github.com/rapidsai/' + 'cuml/blob/{revision}/python/' + '{package}/{path}#L{lineno}') diff --git a/python/setup.py b/python/setup.py index c6191d1f31..62f4ca35e4 100644 --- a/python/setup.py +++ b/python/setup.py @@ -199,8 +199,6 @@ def initialize_options(self): # import cuml_build_ext - # self.distribution.cmdclass['build_ext'] = cuml_build_ext.new_build_ext - # if you wanted to use the Extension class from Cython # from Cython.Distutils.extension import Extension # ext = Extension(....) 
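For readers following the conf.py changes above: `sphinx.ext.linkcode` works by looking up a module-level callable named `linkcode_resolve(domain, info)` in conf.py and invoking it once per documented object, which is what the `linkcode_resolve = make_linkcode_resolve(...)` assignment provides. A minimal sketch of that contract (the path, line number, and URL below are placeholders for illustration, not the real resolver logic):

.. code-block:: python

    def linkcode_resolve(domain, info):
        # Sphinx calls this for every documented object. For the 'py'
        # domain, ``info`` carries 'module' and 'fullname'. Returning
        # None suppresses the [source] link for that object.
        if domain != 'py' or not info.get('module'):
            return None
        # Placeholder mapping: a real resolver inspects the live object
        # to find its source file and line number within the package.
        path, lineno = 'cuml/cluster/dbscan.pyx', 1
        return ('https://github.com/rapidsai/cuml/blob/main/python/%s#L%d'
                % (path, lineno))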
From 43c490ffdd3e3125dc48bde21d5397760371be00 Mon Sep 17 00:00:00 2001 From: Michael Demoret Date: Thu, 6 Aug 2020 23:04:44 -0600 Subject: [PATCH 08/15] Large cleanup of sphinx documentation warnings --- docs/source/_static/copybutton.css | 3 +- docs/source/_static/references.css | 23 +++ docs/source/conf.py | 3 +- python/cuml/cluster/dbscan.pyx | 2 +- python/cuml/common/array.py | 18 +-- python/cuml/common/kernel_utils.py | 4 +- python/cuml/common/logger.pyx | 2 - python/cuml/common/memory_utils.py | 4 +- python/cuml/common/opg_data_utils_mg.pyx | 4 +- python/cuml/common/sparsefuncs.py | 3 +- python/cuml/dask/cluster/kmeans.py | 2 +- python/cuml/dask/datasets/classification.py | 51 +++---- python/cuml/dask/datasets/regression.py | 39 +++--- python/cuml/dask/decomposition/pca.py | 6 +- python/cuml/dask/decomposition/tsvd.py | 46 +++--- .../dask/ensemble/randomforestclassifier.py | 35 +++-- .../dask/ensemble/randomforestregressor.py | 27 ++-- python/cuml/dask/linear_model/elastic_net.py | 2 +- python/cuml/dask/linear_model/lasso.py | 2 +- .../dask/linear_model/linear_regression.py | 2 +- python/cuml/dask/linear_model/ridge.py | 3 +- python/cuml/dask/manifold/umap.py | 36 ++--- python/cuml/datasets/arima.pyx | 6 +- python/cuml/datasets/regression.pyx | 4 +- python/cuml/decomposition/pca.pyx | 2 +- python/cuml/decomposition/tsvd.pyx | 16 ++- .../cuml/ensemble/randomforestclassifier.pyx | 9 +- .../cuml/ensemble/randomforestregressor.pyx | 64 +++++---- .../feature_extraction/_tfidf_vectorizer.py | 10 +- .../cuml/feature_extraction/_vectorizers.py | 57 ++++---- python/cuml/fil/fil.pyx | 131 +++++++++--------- python/cuml/linear_model/elastic_net.pyx | 2 +- python/cuml/linear_model/lasso.pyx | 2 +- .../cuml/linear_model/linear_regression.pyx | 2 +- .../cuml/linear_model/logistic_regression.pyx | 2 +- python/cuml/linear_model/mbsgd_classifier.pyx | 2 +- python/cuml/linear_model/mbsgd_regressor.pyx | 2 +- python/cuml/linear_model/ridge.pyx | 2 +- python/cuml/manifold/umap.pyx | 77 +++++----- python/cuml/metrics/_classification.py | 4 + python/cuml/metrics/_ranking.py | 43 +++--- python/cuml/metrics/pairwise_distances.pyx | 2 +- .../cuml/neighbors/kneighbors_classifier.pyx | 3 +- .../cuml/neighbors/kneighbors_regressor.pyx | 2 +- python/cuml/neighbors/nearest_neighbors.pyx | 2 +- python/cuml/preprocessing/encoders.py | 5 +- python/cuml/preprocessing/model_selection.py | 74 +++++----- .../random_projection/random_projection.pyx | 35 ++--- python/cuml/solvers/qn.pyx | 2 +- python/cuml/solvers/sgd.pyx | 34 ++--- python/cuml/svm/svc.pyx | 4 +- python/cuml/svm/svr.pyx | 62 +++++---- python/cuml/test/test_arima.py | 14 +- python/cuml/test/test_pickle.py | 2 +- python/cuml/test/test_svm.py | 8 +- python/cuml/tsa/arima.pyx | 66 +++++---- python/cuml/tsa/auto_arima.pyx | 21 +-- python/cuml/tsa/holtwinters.pyx | 79 +++++------ 58 files changed, 640 insertions(+), 529 deletions(-) create mode 100644 docs/source/_static/references.css diff --git a/docs/source/_static/copybutton.css b/docs/source/_static/copybutton.css index 31578bc0a8..9ec2ef0ac2 100644 --- a/docs/source/_static/copybutton.css +++ b/docs/source/_static/copybutton.css @@ -35,4 +35,5 @@ div.highlight:hover span.copybutton { div.highlight:hover span.copybutton:hover { background-color: #20252B; -} \ No newline at end of file +} + diff --git a/docs/source/_static/references.css b/docs/source/_static/references.css new file mode 100644 index 0000000000..225cf13ba9 --- /dev/null +++ b/docs/source/_static/references.css @@ -0,0 +1,23 @@ + +/* Fix 
references to not look like parameters */ +dl.citation > dt.label { + display: unset !important; + float: left !important; + border: unset !important; + background: unset !important; + padding: unset !important; + margin: unset !important; + font-size: unset !important; + line-height: unset !important; + padding-right: 0.5rem !important; +} + +/* Add opening bracket */ +dl.citation > dt.label > span::before { + content: "["; +} + +/* Add closing bracket */ +dl.citation > dt.label > span::after { + content: "]"; +} \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py index 9099d03ddb..1db62bd9bd 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -190,8 +190,9 @@ def setup(app): - app.add_css_file('params.css') app.add_css_file('copybutton.css') + app.add_css_file('params.css') + app.add_css_file('references.css') # The following is used by sphinx.ext.linkcode to provide links to github diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx index 263a4b8bdd..b4764f800d 100644 --- a/python/cuml/cluster/dbscan.pyx +++ b/python/cuml/cluster/dbscan.pyx @@ -93,7 +93,7 @@ class DBSCAN(Base): neighbours. Examples - --------- + -------- .. code-block:: python diff --git a/python/cuml/common/array.py b/python/cuml/common/array.py index 9b7661c654..d5c22d6638 100644 --- a/python/cuml/common/array.py +++ b/python/cuml/common/array.py @@ -31,7 +31,7 @@ class CumlArray(Buffer): """ Array represents an abstracted array allocation. It can be instantiated by itself, creating an rmm.DeviceBuffer underneath, or can be instantiated by - __cuda_array_interface__ or __array_interface__ compliant arrays, in which + ``__cuda_array_interface__`` or ``__array_interface__`` compliant arrays, in which case it'll keep a reference to that data underneath. Also can be created from a pointer, specifying the characteristics of the array, in that case the owner of the data referred to by the pointer should be specified @@ -40,7 +40,7 @@ class CumlArray(Buffer): Parameters ---------- - data : rmm.DeviceBuffer, cudf.Buffer, array_like, int, bytes, bytearray or + data : rmm.DeviceBuffer, cudf.Buffer, array_like, int, bytes, bytearray or \ memoryview An array-like object or integer representing a device or host pointer to pre-allocated memory. @@ -71,7 +71,7 @@ class CumlArray(Buffer): strides : tuple of ints Strides of the data __cuda_array_interface__ : dictionary - __cuda_array_interface__ to interop with other libraries. + ``__cuda_array_interface__`` to interop with other libraries. Object Methods -------------- @@ -204,12 +204,12 @@ def to_output(self, output_type='cupy', output_dtype=None): ---------- output_type : string Format to convert the array to. 
Acceptable formats are: - 'cupy' - to cupy array - 'numpy' - to numpy (host) array - 'numba' - to numba device array - 'dataframe' - to cuDF DataFrame - 'series' - to cuDF Series - 'cudf' - to cuDF Series if array is single dimensional, to + - 'cupy' - to cupy array + - 'numpy' - to numpy (host) array + - 'numba' - to numba device array + - 'dataframe' - to cuDF DataFrame + - 'series' - to cuDF Series + - 'cudf' - to cuDF Series if array is single dimensional, to \ DataFrame otherwise output_dtype : string, optional Optionally cast the array to a specified dtype, creating diff --git a/python/cuml/common/kernel_utils.py b/python/cuml/common/kernel_utils.py index 29f28ed6d9..939a5917c8 100644 --- a/python/cuml/common/kernel_utils.py +++ b/python/cuml/common/kernel_utils.py @@ -66,8 +66,8 @@ def cuda_kernel_factory(nvrtc_kernel_str, dtypes, kernel_name=None): included in the kernel string. These will be added by this function and the function name will be made unique, based on the given dtypes. - Example - ------- + Examples + -------- The following kernel string with dtypes = [float, double, int] diff --git a/python/cuml/common/logger.pyx b/python/cuml/common/logger.pyx index a68bead6e8..63f0d085e4 100644 --- a/python/cuml/common/logger.pyx +++ b/python/cuml/common/logger.pyx @@ -97,7 +97,6 @@ def set_level(level): .. code-block:: python - # regular usage of setting a logging level for all subsequent logs # in this case, it will enable all logs upto and including `info()` logger.set_level(logger.level_info) @@ -147,7 +146,6 @@ def set_pattern(pattern): .. code-block:: python - # regular usage of setting a logging pattern for all subsequent logs logger.set_pattern("--> [%H-%M-%S] %v") diff --git a/python/cuml/common/memory_utils.py b/python/cuml/common/memory_utils.py index 56d2ee4300..f73b1e13ac 100644 --- a/python/cuml/common/memory_utils.py +++ b/python/cuml/common/memory_utils.py @@ -76,13 +76,13 @@ def rmm_cupy_ary(cupy_fn, *args, **kwargs): Keyword named arguments to pass to the CuPy function - Note: this function should be used if the result of cupy_fn creates + .. note:: this function should be used if the result of cupy_fn creates a new array. Functions to create a new CuPy array by reference to existing device array (through __cuda_array_interface__) can be used directly. Examples - --------- + -------- .. code-block:: python diff --git a/python/cuml/common/opg_data_utils_mg.pyx b/python/cuml/common/opg_data_utils_mg.pyx index 8204c733fc..e789149e3d 100644 --- a/python/cuml/common/opg_data_utils_mg.pyx +++ b/python/cuml/common/opg_data_utils_mg.pyx @@ -114,7 +114,7 @@ def build_rank_size_pair(parts_to_sizes, rank): parts_to_sizes: array of tuples in the format: [(rank,size)] rank: rank to be mapped - Returns: + Returns -------- ptr: vector pointer of the RankSizePair* """ @@ -162,7 +162,7 @@ def build_part_descriptor(m, n, rank_size_t, rank): building the part descriptor rank: rank to be mapped - Returns: + Returns -------- ptr: PartDescriptor object """ diff --git a/python/cuml/common/sparsefuncs.py b/python/cuml/common/sparsefuncs.py index 3949e9e441..adc3a0e800 100644 --- a/python/cuml/common/sparsefuncs.py +++ b/python/cuml/common/sparsefuncs.py @@ -150,7 +150,8 @@ def _insert_zeros(ary, zero_indices): Create a new array of len(ary + zero_indices) where zero_indices indicates indexes of 0s in the new array. Ary is used to fill the rest. 
- Example: + Examples + -------- _insert_zeros([1, 2, 3], [1, 3]) => [1, 0, 2, 0, 3] """ if len(zero_indices) == 0: diff --git a/python/cuml/dask/cluster/kmeans.py b/python/cuml/dask/cluster/kmeans.py index ac654fd753..dd8d732174 100644 --- a/python/cuml/dask/cluster/kmeans.py +++ b/python/cuml/dask/cluster/kmeans.py @@ -60,7 +60,7 @@ class KMeans(BaseEstimator, DelayedPredictionMixin, DelayedTransformMixin): random_state : int (default = 1) If you want results to be the same when you restart Python, select a state. - init : {'scalable-kmeans++', 'k-means||' , 'random' or an ndarray} + init : {'scalable-kmeans++', 'k-means||' , 'random' or an ndarray} \ (default = 'scalable-k-means++') 'scalable-k-means++' or 'k-means||': Uses fast and stable scalable kmeans++ intialization. diff --git a/python/cuml/dask/datasets/classification.py b/python/cuml/dask/datasets/classification.py index ea5906af83..aeed38ba41 100644 --- a/python/cuml/dask/datasets/classification.py +++ b/python/cuml/dask/datasets/classification.py @@ -41,16 +41,18 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, class_sep=1.0, hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=None, order='F', dtype='float32', n_parts=None, client=None): - """Generate a random n-class classification problem. + """ + Generate a random n-class classification problem. This initially creates clusters of points normally distributed (std=1) - about vertices of an ``n_informative``-dimensional hypercube with sides of - length ``2*class_sep`` and assigns an equal number of clusters to each + about vertices of an `n_informative`-dimensional hypercube with sides of + length ``2 * class_sep`` and assigns an equal number of clusters to each class. It introduces interdependence between these features and adds various types of further noise to the data. + Without shuffling, ``X`` horizontally stacks features in the following - order: the primary ``n_informative`` features, followed by ``n_redundant`` - linear combinations of the informative features, followed by ``n_repeated`` + order: the primary `n_informative` features, followed by `n_redundant` + linear combinations of the informative features, followed by `n_repeated` duplicates, drawn randomly with replacement from the informative and redundant features. The remaining features are filled with random noise. Thus, without shuffling, all useful features are contained in the columns @@ -99,15 +101,15 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, n_samples : int, optional (default=100) The number of samples. n_features : int, optional (default=20) - The total number of features. These comprise ``n_informative`` - informative features, ``n_redundant`` redundant features, - ``n_repeated`` duplicated features and + The total number of features. These comprise `n_informative` + informative features, `n_redundant` redundant features, + `n_repeated` duplicated features and ``n_features-n_informative-n_redundant-n_repeated`` useless features drawn at random. n_informative : int, optional (default=2) The number of informative features. Each class is composed of a number of gaussian clusters each located around the vertices of a hypercube - in a subspace of dimension ``n_informative``. For each cluster, + in a subspace of dimension `n_informative`. For each cluster, informative features are drawn independently from N(0, 1) and then randomly linearly combined within each cluster in order to add covariance. 
The clusters are then placed on the vertices of the @@ -122,13 +124,13 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, The number of classes (or labels) of the classification problem. n_clusters_per_class : int, optional (default=2) The number of clusters per class. - weights : array-like of shape (n_classes,) or (n_classes - 1,),\ - (default=None) + weights : array-like of shape ``(n_classes,)`` or ``(n_classes - 1,)``, \ + (default=None) The proportions of samples assigned to each class. If None, then classes are balanced. Note that if ``len(weights) == n_classes - 1``, then the last class weight is automatically inferred. - More than ``n_samples`` samples may be returned if the sum of - ``weights`` exceeds 1. + More than `n_samples` samples may be returned if the sum of + `weights` exceeds 1. flip_y : float, optional (default=0.01) The fraction of samples whose class is assigned randomly. Larger values introduce noise in the labels and make the classification @@ -171,17 +173,18 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, ----- How we extended the dask MNMG version from the single GPU version: - 1. We generate centroids of shape (n_centroids, n_informative) - 2. We generate an informative covariance of shape - (n_centroids, n_informative, n_informative) - 3. We generate a redundant covariance of shape - (n_informative, n_redundant) - 4. We generate the indices for the repeated features - We pass along the references to the futures of the above arrays - with each part to the single GPU - `cuml.datasets.classification.make_classification` so that each - part (and worker) has access to the correct values to generate - data from the same covariances + 1. We generate centroids of shape ``(n_centroids, n_informative)`` + 2. We generate an informative covariance of shape \ + ``(n_centroids, n_informative, n_informative)`` + 3. We generate a redundant covariance of shape \ + ``(n_informative, n_redundant)`` + 4. We generate the indices for the repeated features \ + We pass along the references to the futures of the above arrays \ + with each part to the single GPU \ + `cuml.datasets.classification.make_classification` so that each \ + part (and worker) has access to the correct values to generate \ + data from the same covariances + """ client = get_client(client=client) diff --git a/python/cuml/dask/datasets/regression.py b/python/cuml/dask/datasets/regression.py index 874405ad44..4802b6b792 100644 --- a/python/cuml/dask/datasets/regression.py +++ b/python/cuml/dask/datasets/regression.py @@ -223,7 +223,7 @@ def make_low_rank_matrix(n_samples=100, n_features=100, tail_strength : float between 0.0 and 1.0, optional (default=0.5) The relative importance of the fat noisy tail of the singular values profile. - random_state : int, CuPy RandomState instance, Dask RandomState instance + random_state : int, CuPy RandomState instance, Dask RandomState instance \ or None (default) Determines random number generation for dataset creation. Pass an int for reproducible output across multiple function calls. @@ -236,6 +236,7 @@ def make_low_rank_matrix(n_samples=100, n_features=100, ------- X : Dask-CuPy array of shape [n_samples, n_features] The matrix. + """ rs = _create_rs_generator(random_state) @@ -276,7 +277,9 @@ def make_regression(n_samples=100, n_features=100, n_informative=10, random_state=None, n_parts=1, n_samples_per_part=None, order='F', dtype='float32', client=None, use_full_low_rank=True): - """Generate a random regression problem. 
+ """ + Generate a random regression problem. + The input set can either be well conditioned (by default) or have a low rank-fat tail singular profile. @@ -305,9 +308,11 @@ def make_regression(n_samples=100, n_features=100, n_informative=10, of the input data by linear combinations. Using this kind of singular spectrum in the input allows the generator to reproduce the correlations often observed in practice. + if None: The input set is well conditioned, centered and gaussian with unit variance. + tail_strength : float between 0.0 and 1.0, optional (default=0.5) The relative importance of the fat noisy tail of the singular values profile if "effective_rank" is not None. @@ -317,7 +322,7 @@ def make_regression(n_samples=100, n_features=100, n_informative=10, Shuffle the samples and the features. coef : boolean, optional (default=False) If True, the coefficients of the underlying linear model are returned. - random_state : int, CuPy RandomState instance, Dask RandomState instance + random_state : int, CuPy RandomState instance, Dask RandomState instance \ or None (default) Determines random number generation for dataset creation. Pass an int for reproducible output across multiple function calls. @@ -339,26 +344,26 @@ def make_regression(n_samples=100, n_features=100, n_informative=10, The input samples. y : Dask-CuPy array of shape [n_samples] or [n_samples, n_targets] The output values. - coef : Dask-CuPy array of shape [n_features] + coef : Dask-CuPy array of shape [n_features] \ or [n_features, n_targets], optional The coefficient of the underlying linear model. It is returned only if coef is True. Notes ----- - - Known Performance Limitations: - 1. When `effective_rank` is set and `use_full_low_rank` is True, - we cannot generate order `F` by construction, and an explicit - transpose is performed on each part. This may cause memory to spike - (other parameters make order `F` by construction) - 2. When `n_targets > 1` and `order = 'F'` as above, we have to - explicity transpose the `y` array. If `coef = True`, then we also - explicity transpose the `ground_truth` array - 3. When `shuffle = True` and `order = F`, there are memory spikes to - shuffle the `F` order arrays - - - NOTE: If out-of-memory errors are encountered in any of the above \ - configurations, try increasing the `n_parts` parameter. + Known Performance Limitations: + 1. When `effective_rank` is set and `use_full_low_rank` is True, \ + we cannot generate order `F` by construction, and an explicit \ + transpose is performed on each part. This may cause memory to spike \ + (other parameters make order `F` by construction) + 2. When `n_targets > 1` and `order = 'F'` as above, we have to \ + explicity transpose the `y` array. If `coef = True`, then we also \ + explicity transpose the `ground_truth` array + 3. When `shuffle = True` and `order = F`, there are memory spikes to \ + shuffle the `F` order arrays + + .. note:: If out-of-memory errors are encountered in any of the above + configurations, try increasing the `n_parts` parameter. """ client = get_client(client=client) diff --git a/python/cuml/dask/decomposition/pca.py b/python/cuml/dask/decomposition/pca.py index 171a5e3d82..3c128e1831 100644 --- a/python/cuml/dask/decomposition/pca.py +++ b/python/cuml/dask/decomposition/pca.py @@ -37,7 +37,7 @@ class PCA(BaseDecomposition, then selects the top K eigenvectors. Examples - --------- + -------- .. 
code-block:: python @@ -92,8 +92,8 @@ class PCA(BaseDecomposition, 1 0.011454 2 -0.008182 - Note: Everytime this code is run, the output will be different because - "make_blobs" function generates random matrices. + .. note:: Every time this code is run, the output will be different because + "make_blobs" function generates random matrices. Parameters ---------- diff --git a/python/cuml/dask/decomposition/tsvd.py b/python/cuml/dask/decomposition/tsvd.py index 2915bd4b8a..c823d33b23 100644 --- a/python/cuml/dask/decomposition/tsvd.py +++ b/python/cuml/dask/decomposition/tsvd.py @@ -27,7 +27,7 @@ class TruncatedSVD(BaseDecomposition, DecompositionSyncFitMixin): """ Examples - --------- + -------- .. code-block:: python @@ -64,25 +64,26 @@ class TruncatedSVD(BaseDecomposition, .. code-block:: python - Input Matrix: - 0 1 2 - 0 -8.519647 -8.519222 -8.865648 - 1 -6.107700 -8.350124 -10.351215 - 2 -8.026635 -9.442240 -7.561770 - 0 -8.519647 -8.519222 -8.865648 - 1 -6.107700 -8.350124 -10.351215 - 2 -8.026635 -9.442240 -7.561770 - - Transformed Input Matrix: - 0 - 0 14.928891 - 1 14.487295 - 2 14.431235 - 0 14.928891 - 1 14.487295 - 2 14.431235 - Note: Everytime this code is run, the output will be different because - "make_blobs" function generates random matrices. + Input Matrix: + 0 1 2 + 0 -8.519647 -8.519222 -8.865648 + 1 -6.107700 -8.350124 -10.351215 + 2 -8.026635 -9.442240 -7.561770 + 0 -8.519647 -8.519222 -8.865648 + 1 -6.107700 -8.350124 -10.351215 + 2 -8.026635 -9.442240 -7.561770 + + Transformed Input Matrix: + 0 + 0 14.928891 + 1 14.487295 + 2 14.431235 + 0 14.928891 + 1 14.487295 + 2 14.431235 + + .. note:: Every time this code is run, the output will be different because + "make_blobs" function generates random matrices. Parameters ---------- @@ -107,6 +108,7 @@ class TruncatedSVD(BaseDecomposition, How much in % the variance is explained given by S**2/sum(S**2) singular_values_ : array The top K singular values. Remember all singular values >= 0 + """ def __init__(self, client=None, **kwargs): @@ -151,9 +153,9 @@ def fit_transform(self, X): def transform(self, X, delayed=True): """ - Apply dimensionality reduction to X. + Apply dimensionality reduction to `X`. - X is projected on the first principal components previously extracted + `X` is projected on the first principal components previously extracted from a training set. Parameters diff --git a/python/cuml/dask/ensemble/randomforestclassifier.py b/python/cuml/dask/ensemble/randomforestclassifier.py index b5388db070..f4ecdf1e89 100755 --- a/python/cuml/dask/ensemble/randomforestclassifier.py +++ b/python/cuml/dask/ensemble/randomforestclassifier.py @@ -36,12 +36,12 @@ class RandomForestClassifier(BaseRandomForestModel, DelayedPredictionMixin, (possibly on different nodes). Currently, this API makes the following assumptions: - * The set of Dask workers used between instantiation, fit, - and predict are all consistent - * Training data comes in the form of cuDF dataframes or Dask Arrays - distributed so that each worker has at least one partition. - * The print_summary and print_detailed functions print the - information of the forest on the worker. + * The set of Dask workers used between instantiation, fit, \ + and predict are all consistent + * Training data comes in the form of cuDF dataframes or Dask Arrays \ + distributed so that each worker has at least one partition. + * The print_summary and print_detailed functions print the \ + information of the forest on the worker.
Future versions of the API will support more flexible data distribution and additional input types. @@ -70,8 +70,7 @@ class RandomForestClassifier(BaseRandomForestModel, DelayedPredictionMixin, 0 for GINI, 1 for ENTROPY, 4 for CRITERION_END. 2 and 3 not valid for classification (default = 0) - split_algo : 0 for HIST and 1 for GLOBAL_QUANTILE - (default = 1) + split_algo : 0 for HIST and 1 for GLOBAL_QUANTILE (default = 1) the algorithm to determine how nodes are split in the tree. split_criterion : The criterion used to split nodes. 0 for GINI, 1 for ENTROPY, 4 for CRITERION_END. @@ -111,7 +110,7 @@ class RandomForestClassifier(BaseRandomForestModel, DelayedPredictionMixin, not currently fully guarantee the exact same results. Examples - --------- + -------- For usage examples, please see the RAPIDS notebooks repository: https://github.com/rapidsai/notebooks/blob/branch-0.12/cuml/random_forest_mnmg_demo.ipynb """ @@ -182,7 +181,9 @@ def fit(self, X, y, convert_dtype=False): memory consumption, ensure that each worker has exactly one partition. When persisting data, you can use - cuml.dask.common.utils.persist_across_workers to simplify this:: + `cuml.dask.common.utils.persist_across_workers` to simplify this: + + .. code-block:: python X_dask_cudf = dask_cudf.from_cudf(X_cudf, npartitions=n_workers) y_dask_cudf = dask_cudf.from_cudf(y_cudf, npartitions=n_workers) @@ -190,7 +191,10 @@ def fit(self, X, y, convert_dtype=False): [X_dask_cudf, y_dask_cudf]) - (this is equivalent to calling `persist` with the data and workers):: + This is equivalent to calling `persist` with the data and workers: + + .. code-block:: python + X_dask_cudf, y_dask_cudf = dask_client.persist([X_dask_cudf, y_dask_cudf], workers={ @@ -265,7 +269,7 @@ def predict(self, X, output_class=True, algo='auto', threshold=0.5, coalescing-friendly 'batch_tree_reorg' - similar to tree_reorg but predicting multiple rows per thread block - `algo` - choose the algorithm automatically. Currently + 'auto' - choose the algorithm automatically. Currently 'batch_tree_reorg' is used for dense storage and 'naive' for sparse storage threshold : float (default = 0.5) @@ -391,7 +395,7 @@ def predict_proba(self, X, """ Predicts the probability of each class for X. - See documentation of `predict' for notes on performance. + See documentation of `predict` for notes on performance. Parameters ---------- @@ -417,7 +421,7 @@ def predict_proba(self, X, coalescing-friendly 'batch_tree_reorg' - similar to tree_reorg but predicting multiple rows per thread block - `auto` - choose the algorithm automatically. Currently + 'auto' - choose the algorithm automatically. Currently 'batch_tree_reorg' is used for dense storage and 'naive' for sparse storage threshold : float (default = 0.5) @@ -439,9 +443,10 @@ def predict_proba(self, X, or algo='auto' Returns - ---------- + ------- y : NumPy Dask cuDF dataframe or CuPy backed Dask Array (n_rows, n_classes) + """ if self._get_internal_model() is None: self._set_internal_model(self._concat_treelite_models()) diff --git a/python/cuml/dask/ensemble/randomforestregressor.py b/python/cuml/dask/ensemble/randomforestregressor.py index fa35fa90bd..716628a036 100755 --- a/python/cuml/dask/ensemble/randomforestregressor.py +++ b/python/cuml/dask/ensemble/randomforestregressor.py @@ -30,12 +30,12 @@ class RandomForestRegressor(BaseRandomForestModel, DelayedPredictionMixin, (possibly on different nodes).
Currently, this API makes the following assumptions: - * The set of Dask workers used between instantiation, fit, - and predict are all consistent - * Training data comes in the form of cuDF dataframes or Dask Arrays - distributed so that each worker has at least one partition. - * The print_summary and print_detailed functions print the - information of the forest on the worker. + * The set of Dask workers used between instantiation, fit, + and predict are all consistent + * Training data comes in the form of cuDF dataframes or Dask Arrays + distributed so that each worker has at least one partition. + * The print_summary and print_detailed functions print the + information of the forest on the worker. Future versions of the API will support more flexible data distribution and additional input types. User-facing APIs are @@ -174,7 +174,9 @@ def fit(self, X, y, convert_dtype=False): on each Dask worker being used by the forest (self.workers). When persisting data, you can use - cuml.dask.common.utils.persist_across_workers to simplify this:: + `cuml.dask.common.utils.persist_across_workers` to simplify this: + + .. code-block:: python X_dask_cudf = dask_cudf.from_cudf(X_cudf, npartitions=n_workers) y_dask_cudf = dask_cudf.from_cudf(y_cudf, npartitions=n_workers) @@ -182,7 +184,10 @@ def fit(self, X, y, convert_dtype=False): [X_dask_cudf, y_dask_cudf]) - (this is equivalent to calling `persist` with the data and workers):: + This is equivalent to calling `persist` with the data and workers: + + .. code-block:: python + X_dask_cudf, y_dask_cudf = dask_client.persist([X_dask_cudf, y_dask_cudf], workers={ @@ -202,6 +207,7 @@ def fit(self, X, y, convert_dtype=False): When set to True, the fit method will, when necessary, convert y to be the same data type as X if they differ. This will increase memory used for the method. + """ self.internal_model = None self._fit(model=self.rfs, @@ -274,8 +280,9 @@ def predict(self, X, predict_model="GPU", algo='auto', eagerly executed one. Returns - ---------- - y : Dask cuDF dataframe or CuPy backed Dask Array (n_rows, 1) + ------- + y : Dask cuDF dataframe or CuPy backed Dask Array (n_rows, 1) + """ if predict_model == "CPU": preds = self.predict_model_on_cpu(X, convert_dtype=convert_dtype) diff --git a/python/cuml/dask/linear_model/elastic_net.py b/python/cuml/dask/linear_model/elastic_net.py index 460c8a9e7c..46871849ef 100644 --- a/python/cuml/dask/linear_model/elastic_net.py +++ b/python/cuml/dask/linear_model/elastic_net.py @@ -82,7 +82,7 @@ class ElasticNet(BaseEstimator): coef_ : array, shape (n_features) The estimated coefficients for the linear regression model. intercept_ : array - The independent term. If fit_intercept_ is False, will be 0. + The independent term. If `fit_intercept_` is False, will be 0. For additional docs, see `scikitlearn's ElasticNet diff --git a/python/cuml/dask/linear_model/lasso.py b/python/cuml/dask/linear_model/lasso.py index a35373fb44..27a6197ba0 100644 --- a/python/cuml/dask/linear_model/lasso.py +++ b/python/cuml/dask/linear_model/lasso.py @@ -67,7 +67,7 @@ class Lasso(BaseEstimator): coef_ : array, shape (n_features) The estimated coefficients for the linear regression model. intercept_ : array - The independent term. If fit_intercept_ is False, will be 0. + The independent term. If `fit_intercept_` is False, will be 0. For additional docs, see `scikitlearn's Lasso `_.
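To make the `coef_` / `intercept_` attributes documented above concrete, here is a rough usage sketch for the Dask `Lasso` wrapper (not part of this patch; it assumes a running dask-CUDA cluster, and the data and parameter values are illustrative only):

.. code-block:: python

    import cudf
    import dask_cudf
    from dask.distributed import Client
    from dask_cuda import LocalCUDACluster

    from cuml.dask.linear_model import Lasso

    client = Client(LocalCUDACluster())

    # Tiny illustrative dataset, partitioned across the workers
    X = dask_cudf.from_cudf(cudf.DataFrame({'a': [0., 1., 2., 3.]}),
                            npartitions=2)
    y = dask_cudf.from_cudf(cudf.Series([0.0, 0.5, 1.0, 1.5]),
                            npartitions=2)

    model = Lasso(alpha=0.1, client=client)
    model.fit(X, y)
    preds = model.predict(X)  # lazy by default; call .compute() to realize
    print(model.coef_, model.intercept_)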
diff --git a/python/cuml/dask/linear_model/linear_regression.py b/python/cuml/dask/linear_model/linear_regression.py index 1b8be723d8..4ec3e9fd99 100644 --- a/python/cuml/dask/linear_model/linear_regression.py +++ b/python/cuml/dask/linear_model/linear_regression.py @@ -59,7 +59,7 @@ class LinearRegression(BaseEstimator, coef_ : cuDF series, shape (n_features) The estimated coefficients for the linear regression model. intercept_ : array - The independent term. If fit_intercept_ is False, will be 0. + The independent term. If `fit_intercept_` is False, will be 0. """ def __init__(self, client=None, verbose=False, **kwargs): diff --git a/python/cuml/dask/linear_model/ridge.py b/python/cuml/dask/linear_model/ridge.py index 28335d952c..9c0ccbd920 100644 --- a/python/cuml/dask/linear_model/ridge.py +++ b/python/cuml/dask/linear_model/ridge.py @@ -65,7 +65,8 @@ class Ridge(BaseEstimator, coef_ : array, shape (n_features) The estimated coefficients for the linear regression model. intercept_ : array - The independent term. If fit_intercept_ is False, will be 0. + The independent term. If `fit_intercept_` is False, will be 0. + """ def __init__(self, client=None, verbose=False, **kwargs): diff --git a/python/cuml/dask/manifold/umap.py b/python/cuml/dask/manifold/umap.py index 77bdf7df1e..0cb6d554f7 100644 --- a/python/cuml/dask/manifold/umap.py +++ b/python/cuml/dask/manifold/umap.py @@ -19,15 +19,16 @@ class UMAP(BaseEstimator, DelayedTransformMixin): - """ + r""" Uniform Manifold Approximation and Projection + Finds a low dimensional embedding of the data that approximates an underlying manifold. Adapted from https://github.com/lmcinnes/umap/blob/master/umap/umap_.py Examples - ---------- + -------- .. code-block:: python @@ -59,34 +60,35 @@ class UMAP(BaseEstimator, distributed_model = MNMG_UMAP(local_model) embedding = distributed_model.transform(X) - Note: Everytime this code is run, the output will be different because + .. note:: Every time this code is run, the output will be different because "make_blobs" function generates random matrices. Notes ----- - This module is heavily based on Leland McInnes' reference UMAP package. + This module is heavily based on Leland McInnes' reference UMAP package [1]_. However, there are a number of differences and features that are - not yet implemented in cuml.umap: - * Using a non-Euclidean distance metric (support for a fixed set - of non-Euclidean metrics is planned for an upcoming release). - * Using a pre-computed pairwise distance matrix (under consideration - for future releases) - * Manual initialization of initial embedding positions + not yet implemented in `cuml.umap`: + * Using a non-Euclidean distance metric (support for a fixed set + of non-Euclidean metrics is planned for an upcoming release). + * Using a pre-computed pairwise distance matrix (under consideration + for future releases) + * Manual initialization of initial embedding positions In addition to these missing features, you should expect to see - the final embeddings differing between cuml.umap and the reference + the final embeddings differing between `cuml.umap` and the reference UMAP. In particular, the reference UMAP uses an approximate kNN algorithm for large data sizes while cuml.umap always uses exact kNN.
- Known issue: If a UMAP model has not yet been fit, it cannot be pickled + **Known issue:** If a UMAP model has not yet been fit, it cannot be pickled References ---------- - * Leland McInnes, John Healy, James Melville - UMAP: Uniform Manifold Approximation and Projection for Dimension - Reduction - https://arxiv.org/abs/1802.03426 + + .. [1] `Leland McInnes, John Healy, James Melville + UMAP: Uniform Manifold Approximation and Projection for Dimension + Reduction. + `_ """ def __init__(self, model, client=None, **kwargs): @@ -95,7 +97,7 @@ def __init__(self, model, client=None, **kwargs): self._set_internal_model(model) def transform(self, X, convert_dtype=True): - """ + r""" Transform X into the existing embedded space and return that transformed output. diff --git a/python/cuml/datasets/arima.pyx b/python/cuml/datasets/arima.pyx index 8d6f10de2b..3972fd5027 100644 --- a/python/cuml/datasets/arima.pyx +++ b/python/cuml/datasets/arima.pyx @@ -73,8 +73,8 @@ def make_arima(batch_size=1000, n_obs=100, order=(1, 1, 1), r"""Generates a dataset of time series by simulating an ARIMA process of a given order. - Example - ------- + Examples + -------- .. code-block:: python from cuml.datasets import make_arima @@ -102,7 +102,7 @@ def make_arima(batch_size=1000, n_obs=100, order=(1, 1, 1), handle: cuml.Handle If it is None, a new one is created just for this function call - Returns: + Returns -------- out: array-like, shape (n_obs, batch_size) Array of the requested type containing the generated dataset diff --git a/python/cuml/datasets/regression.pyx b/python/cuml/datasets/regression.pyx index b2adde56a6..b2c929fffc 100644 --- a/python/cuml/datasets/regression.pyx +++ b/python/cuml/datasets/regression.pyx @@ -80,8 +80,8 @@ def make_regression(n_samples=100, n_features=2, n_informative=2, n_targets=1, See https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_regression.html - Example - ------- + Examples + -------- .. code-block:: python diff --git a/python/cuml/decomposition/pca.pyx b/python/cuml/decomposition/pca.pyx index 64f6c3facb..238afed022 100644 --- a/python/cuml/decomposition/pca.pyx +++ b/python/cuml/decomposition/pca.pyx @@ -122,7 +122,7 @@ class PCA(Base): less accurate. Examples - --------- + -------- .. code-block:: python diff --git a/python/cuml/decomposition/tsvd.pyx b/python/cuml/decomposition/tsvd.pyx index 3534ca07e5..2da71861e3 100644 --- a/python/cuml/decomposition/tsvd.pyx +++ b/python/cuml/decomposition/tsvd.pyx @@ -115,7 +115,7 @@ class TruncatedSVD(Base): might be less accurate. Examples - --------- + -------- .. code-block:: python @@ -221,14 +221,15 @@ class TruncatedSVD(Base): **Applications of TruncatedSVD** - TruncatedSVD is also known as Latent Semantic Indexing (LSI) which - tries to find topics of a word count matrix. If X previously was - centered with mean removal, TruncatedSVD is the same as TruncatedPCA. - TruncatedSVD is also used in information retrieval tasks, - recommendation systems and data compression. + TruncatedSVD is also known as Latent Semantic Indexing (LSI) which + tries to find topics of a word count matrix. If X previously was + centered with mean removal, TruncatedSVD is the same as TruncatedPCA. + TruncatedSVD is also used in information retrieval tasks, + recommendation systems and data compression. For additional documentation, see `scikitlearn's TruncatedSVD docs `_. 
+ """ def __init__(self, algorithm='full', handle=None, n_components=1, @@ -449,12 +450,14 @@ class TruncatedSVD(Base): def transform(self, X, convert_dtype=False): """ Perform dimensionality reduction on X. + Parameters ---------- X : array-like (device or host) shape = (n_samples, n_features) Dense matrix (floats or doubles) of shape (n_samples, n_features). Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device ndarray, cuda array interface compliant array like CuPy + convert_dtype : bool, optional (default = False) When set to True, the transform method will automatically convert the input to the data type which was used to train the @@ -464,6 +467,7 @@ class TruncatedSVD(Base): ------- X_new : cuDF DataFrame, shape (n_samples, n_components) Reduced version of X. This will always be a dense DataFrame. + """ input, n_rows, _, dtype = \ input_to_cuml_array(X, check_dtype=self.dtype, diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx index ee527b7dd2..7df4340b2c 100644 --- a/python/cuml/ensemble/randomforestclassifier.pyx +++ b/python/cuml/ensemble/randomforestclassifier.pyx @@ -144,7 +144,7 @@ class RandomForestClassifier(BaseRandomForestModel, ClassifierMixin): reduce memory consumption. Examples - --------- + -------- .. code-block:: python import numpy as np @@ -332,6 +332,7 @@ class RandomForestClassifier(BaseRandomForestModel, ClassifierMixin): Parameters ---------- + output_class : boolean (default = True) This is optional and required only while performing the predict operation on the GPU. @@ -364,10 +365,12 @@ class RandomForestClassifier(BaseRandomForestModel, ClassifierMixin): or algo='auto' Returns - ---------- - fil_model : + ------- + + fil_model A Forest Inference model which can be used to perform inferencing on the random forest model. + """ treelite_handle = self._obtain_treelite_handle() return _obtain_fil_model(treelite_handle=treelite_handle, diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index 18351f8150..c86b32c821 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -110,43 +110,45 @@ class RandomForestRegressor(BaseRandomForestModel, RegressorMixin): """ Implements a Random Forest regressor model which fits multiple decision trees in an ensemble. - Note that the underlying algorithm for tree node splits differs from that - used in scikit-learn. By default, the cuML Random Forest uses a - histogram-based algorithm to determine splits, rather than an exact - count. You can tune the size of the histograms with the n_bins parameter. + + .. note:: that the underlying algorithm for tree node splits differs from that + used in scikit-learn. By default, the cuML Random Forest uses a + histogram-based algorithm to determine splits, rather than an exact + count. You can tune the size of the histograms with the n_bins parameter. **Known Limitations**: This is an early release of the cuML Random Forest code. It contains a few known limitations: - * GPU-based inference is only supported if the model was trained - with 32-bit (float32) datatypes. CPU-based inference may be used - in this case as a slower fallback. - * Very deep / very wide models may exhaust available GPU memory. - Future versions of cuML will provide an alternative algorithm to - reduce memory consumption. + * GPU-based inference is only supported if the model was trained + with 32-bit (float32) datatypes. 
CPU-based inference may be used + in this case as a slower fallback. + * Very deep / very wide models may exhaust available GPU memory. + Future versions of cuML will provide an alternative algorithm to + reduce memory consumption. Examples - --------- + -------- + .. code-block:: python - import numpy as np - from cuml.test.utils import get_handle - from cuml.ensemble import RandomForestRegressor as curfc - from cuml.test.utils import get_handle - X = np.asarray([[0,10],[0,20],[0,30],[0,40]], dtype=np.float32) - y = np.asarray([0.0,1.0,2.0,3.0], dtype=np.float32) - cuml_model = curfc(max_features=1.0, n_bins=8, - split_algo=0, min_rows_per_node=2, - n_estimators=40, accuracy_metric='mse') - cuml_model.fit(X,y) - cuml_score = cuml_model.score(X,y) - print("MSE score of cuml : ", cuml_score) + import numpy as np + from cuml.test.utils import get_handle + from cuml.ensemble import RandomForestRegressor as curfc + from cuml.test.utils import get_handle + X = np.asarray([[0,10],[0,20],[0,30],[0,40]], dtype=np.float32) + y = np.asarray([0.0,1.0,2.0,3.0], dtype=np.float32) + cuml_model = curfc(max_features=1.0, n_bins=8, + split_algo=0, min_rows_per_node=2, + n_estimators=40, accuracy_metric='mse') + cuml_model.fit(X,y) + cuml_score = cuml_model.score(X,y) + print("MSE score of cuml : ", cuml_score) Output: .. code-block:: python - MSE score of cuml : 0.1123437201231765 + MSE score of cuml : 0.1123437201231765 Parameters ----------- @@ -165,7 +167,7 @@ class RandomForestRegressor(BaseRandomForestModel, RegressorMixin): 2 for MSE, or 3 for MAE 0 and 1 not valid for regression bootstrap : boolean (default = True) - Control bootstrapping. + Control bootstrapping. If True, each tree in the forest is built on a bootstrapped sample with replacement. If False, sampling without replacement is done. @@ -182,7 +184,7 @@ class RandomForestRegressor(BaseRandomForestModel, RegressorMixin): max_leaves : int (default = -1) Maximum leaf nodes per tree. Soft constraint. Unlimited, if -1. - max_features : int, float, or string (default = 'auto') + max_features : int, float, or string (default = 'auto') Ratio of number of features (columns) to consider per node split. If int then max_features/n_features. @@ -315,6 +317,7 @@ class RandomForestRegressor(BaseRandomForestModel, RegressorMixin): """ Create a Forest Inference (FIL) model from the trained cuML Random Forest model. + Parameters ---------- output_class : boolean (default = False) @@ -323,6 +326,7 @@ class RandomForestRegressor(BaseRandomForestModel, RegressorMixin): If true, return a 1 or 0 depending on whether the raw prediction exceeds the threshold. If False, just return the raw prediction. + algo : string (default = 'auto') This is optional and required only while performing the predict operation on the GPU. @@ -334,6 +338,7 @@ class RandomForestRegressor(BaseRandomForestModel, RegressorMixin): `auto` - choose the algorithm automatically. Currently 'batch_tree_reorg' is used for dense storage and 'naive' for sparse storage + fil_sparse_format : boolean or string (default = 'auto') This variable is used to choose the type of forest that will be created in the Forest Inference Library. It is not required @@ -343,11 +348,14 @@ class RandomForestRegressor(BaseRandomForestModel, RegressorMixin): False - create a dense forest True - create a sparse forest, requires algo='naive' or algo='auto' + Returns - ---------- - fil_model : + ------- + + fil_model A Forest Inference model which can be used to perform inferencing on the random forest model. 
+ """ treelite_handle = self._obtain_treelite_handle() return _obtain_fil_model(treelite_handle=treelite_handle, diff --git a/python/cuml/feature_extraction/_tfidf_vectorizer.py b/python/cuml/feature_extraction/_tfidf_vectorizer.py index 54fb28a525..c774ae11dc 100644 --- a/python/cuml/feature_extraction/_tfidf_vectorizer.py +++ b/python/cuml/feature_extraction/_tfidf_vectorizer.py @@ -94,10 +94,11 @@ class TfidfVectorizer(CountVectorizer): Typically the delimiting character between words is a good choice. norm : {'l1', 'l2'}, default='l2' Each output row will have unit norm, either: - * 'l2': Sum of squares of vector elements is 1. The cosine - similarity between two vectors is their dot product when l2 norm has - been applied. - * 'l1': Sum of absolute values of vector elements is 1. + * 'l2': Sum of squares of vector elements is 1. The cosine \ + similarity between two vectors is their dot product when l2 norm has \ + been applied. + * 'l1': Sum of absolute values of vector elements is 1. + use_idf : bool, default=True Enable inverse-document-frequency reweighting. smooth_idf : bool, default=True @@ -119,6 +120,7 @@ class TfidfVectorizer(CountVectorizer): - occurred in too many documents (`max_df`) - occurred in too few documents (`min_df`) - were cut off by feature selection (`max_features`). + This is only available if no vocabulary was given. Notes diff --git a/python/cuml/feature_extraction/_vectorizers.py b/python/cuml/feature_extraction/_vectorizers.py index ef7d03bfe8..9fc2f01b12 100644 --- a/python/cuml/feature_extraction/_vectorizers.py +++ b/python/cuml/feature_extraction/_vectorizers.py @@ -373,7 +373,9 @@ class CountVectorizer(_VectorizerMixin): - occurred in too many documents (`max_df`) - occurred in too few documents (`min_df`) - were cut off by feature selection (`max_features`). + This is only available if no vocabulary was given. + """ def __init__(self, input=None, encoding=None, decode_error=None, @@ -456,7 +458,7 @@ def _limit_features(self, count_df, vocab, high, low, limit): documents than low, modifying the vocabulary, and restricting it to at most the limit most frequent. - Sets self.vocabulary_ and self.stop_words_ with the new values. + Sets `self.vocabulary_` and `self.stop_words_` with the new values. """ if high is None and low is None and limit is None: self.stop_words_ = None @@ -499,15 +501,17 @@ def fit(self, raw_documents): """ Build a vocabulary of all tokens in the raw documents. - Parameters - ---------- - raw_documents : cudf.Series - A Series of string documents + Parameters + ---------- - Returns - ------- - self - """ + raw_documents : cudf.Series + A Series of string documents + + Returns + ------- + self + + """ self.fit_transform(raw_documents) return self @@ -515,7 +519,7 @@ def fit_transform(self, raw_documents): """ Build the vocabulary and return document-term matrix. - Equivalent to .fit(X).transform(X) but preprocess X only once. + Equivalent to ``self.fit(X).transform(X)`` but preprocess `X` only once. Parameters ---------- @@ -624,10 +628,13 @@ def inverse_transform(self, X): def get_feature_names(self): """ Array mapping from feature integer indices to feature name. + Returns ------- + feature_names : Series A list of feature names. 
+ """ return self.vocabulary_ @@ -646,24 +653,24 @@ class HashingVectorizer(_VectorizerMixin): This strategy has several advantages: - - it is very low memory scalable to large datasets as there is no need to - store a vocabulary dictionary in memory which is even more important - as GPU's that are often memory constrained - - it is fast to pickle and un-pickle as it holds no state besides the - constructor parameters - - it can be used in a streaming (partial fit) or parallel pipeline as there - is no state computed during fit. + - it is very low memory scalable to large datasets as there is no need to\ + store a vocabulary dictionary in memory which is even more important \ + as GPU's that are often memory constrained + - it is fast to pickle and un-pickle as it holds no state besides the \ + constructor parameters + - it can be used in a streaming (partial fit) or parallel pipeline as there \ + is no state computed during fit. There are also a couple of cons (vs using a CountVectorizer with an in-memory vocabulary): - - there is no way to compute the inverse transform (from feature indices to - string feature names) which can be a problem when trying to introspect - which features are most important to a model. - - there can be collisions: distinct tokens can be mapped to the same - feature index. However in practice this is rarely an issue if n_features - is large enough (e.g. 2 ** 18 for text classification problems). - - no IDF weighting as this would render the transformer stateful. + - there is no way to compute the inverse transform (from feature indices to \ + string feature names) which can be a problem when trying to introspect \ + which features are most important to a model. + - there can be collisions: distinct tokens can be mapped to the same \ + feature index. However in practice this is rarely an issue if n_features \ + is large enough (e.g. 2 ** 18 for text classification problems). + - no IDF weighting as this would render the transformer stateful. The hash function employed is the signed 32-bit version of Murmurhash3. @@ -709,7 +716,7 @@ class HashingVectorizer(_VectorizerMixin): dtype : type, optional Type of the matrix returned by fit_transform() or transform(). delimiter : str, whitespace by default - String used as a replacement for stop words if stop_words is not None. + String used as a replacement for stop words if `stop_words` is not None. Typically the delimiting character between words is a good choice. Examples diff --git a/python/cuml/fil/fil.pyx b/python/cuml/fil/fil.pyx index 647193758f..9fda25a9d1 100644 --- a/python/cuml/fil/fil.pyx +++ b/python/cuml/fil/fil.pyx @@ -68,7 +68,7 @@ cdef class TreeliteModel(): """ Wrapper for Treelite-loaded forest - Note: This is only used for loading saved models into ForestInference, + .. note:: This is only used for loading saved models into ForestInference, it does not actually perform inference. Users typically do not need to access TreeliteModel instances directly. 
@@ -251,8 +251,10 @@ cdef class ForestInference_impl(): matches sklearn Returns - ---------- + ------- + Predicted results of type as defined by the output_type variable + """ if (not self.output_class) and predict_proba: raise NotImplementedError("Predict_proba function is not available" @@ -377,7 +379,8 @@ cdef class ForestInference_impl(): class ForestInference(Base): - """ForestInference provides GPU-accelerated inference (prediction) + """ + ForestInference provides GPU-accelerated inference (prediction) for random forest and boosted decision tree models. This module does not support training models. Rather, users should @@ -438,6 +441,7 @@ class ForestInference(Base): https://github.com/rapidsai/cuml/blob/branch-0.14/notebooks/forest_inference_demo.ipynb """ + def __init__(self, handle=None, output_type=None): super(ForestInference, self).__init__(handle, @@ -507,37 +511,37 @@ class ForestInference(Base): Parameters ---------- - model : the trained model information in the treelite format - loaded from a saved model using the treelite API - https://treelite.readthedocs.io/en/latest/treelite-api.html + model + the trained model information in the treelite format + loaded from a saved model using the treelite API + https://treelite.readthedocs.io/en/latest/treelite-api.html output_class: boolean (default=False) - For a Classification model output_class must be True. - For a Regression model output_class must be False. + For a Classification model output_class must be True. + For a Regression model output_class must be False. algo : string (default='auto') - name of the algo from (from algo_t enum) - 'AUTO' or 'auto' - choose the algorithm automatically; - currently 'BATCH_TREE_REORG' is used for dense storage, - and 'NAIVE' for sparse storage - 'NAIVE' or 'naive' - simple inference using shared memory - 'TREE_REORG' or 'tree_reorg' - similar to naive but trees - rearranged to be more coalescing-friendly - 'BATCH_TREE_REORG' or 'batch_tree_reorg' - similar to TREE_REORG - but predicting multiple rows - per thread block + name of the algo from (from algo_t enum) : + - 'AUTO' or 'auto' - choose the algorithm automatically; \ + currently 'BATCH_TREE_REORG' is used for dense storage, \ + and 'NAIVE' for sparse storage + - 'NAIVE' or 'naive' - simple inference using shared memory + - 'TREE_REORG' or 'tree_reorg' - similar to naive but trees \ + rearranged to be more coalescing-friendly + - 'BATCH_TREE_REORG' or 'batch_tree_reorg' - similar to TREE_REORG \ + but predicting multiple rows per thread block threshold : float (default=0.5) Threshold is used to for classification. It is applied only if output_class == True, else it is ignored. storage_type : string or boolean (default='auto') - In-memory storage format to be used for the FIL model. - 'auto' - choose the storage type automatically - (currently DENSE is always used) - False - create a dense forest - True - create a sparse forest; - requires algo='NAIVE' or algo='AUTO' + In-memory storage format to be used for the FIL model: + - 'auto' - choose the storage type automatically \ + (currently DENSE is always used) + - False - create a dense forest + - True - create a sparse forest; \ + requires algo='NAIVE' or algo='AUTO' Returns ---------- - fil_model : + fil_model A Forest Inference model which can be used to perform inferencing on the random forest/ XGBoost model. """ @@ -564,35 +568,35 @@ class ForestInference(Base): Parameters ---------- - skl_model : The scikit-learn model from which to build the FIL version. 
+ skl_model + The scikit-learn model from which to build the FIL version. output_class: boolean (default=False) - For a Classification model output_class must be True. - For a Regression model output_class must be False. + For a Classification model output_class must be True. + For a Regression model output_class must be False. algo : string (default='auto') - name of the algo from (from algo_t enum) - 'AUTO' or 'auto' - choose the algorithm automatically; - currently 'BATCH_TREE_REORG' is used for dense storage, - and 'NAIVE' for sparse storage - 'NAIVE' or 'naive' - simple inference using shared memory - 'TREE_REORG' or 'tree_reorg' - similar to naive but trees - rearranged to be more coalescing-friendly - 'BATCH_TREE_REORG' or 'batch_tree_reorg' - similar to TREE_REORG - but predicting multiple rows - per thread block + name of the algo (from algo_t enum): + - 'AUTO' or 'auto' - choose the algorithm automatically; \ currently 'BATCH_TREE_REORG' is used for dense storage, \ and 'NAIVE' for sparse storage + - 'NAIVE' or 'naive' - simple inference using shared memory + - 'TREE_REORG' or 'tree_reorg' - similar to naive but trees \ rearranged to be more coalescing-friendly + - 'BATCH_TREE_REORG' or 'batch_tree_reorg' - similar to TREE_REORG \ but predicting multiple rows per thread block threshold : float (default=0.5) Threshold is used for classification. It is applied - only if output_class == True, else it is ignored. + only if ``output_class == True``, else it is ignored. storage_type : string or boolean (default='auto') - In-memory storage format to be used for the FIL model. - 'auto' - choose the storage type automatically + In-memory storage format to be used for the FIL model: + - 'auto' - choose the storage type automatically \ (currently DENSE is always used) - False - create a dense forest - True - create a sparse forest; + - False - create a dense forest + - True - create a sparse forest; \ requires algo='NAIVE' or algo='AUTO' Returns ---------- - fil_model : + fil_model A Forest Inference model created from the scikit-learn model passed. @@ -613,36 +617,37 @@ class ForestInference(Base): model_type="xgboost", handle=None): """ - Returns a FIL instance containing the forest saved in 'filename' + Returns a FIL instance containing the forest saved in `filename` This uses Treelite to load the saved model. Parameters ---------- filename : string - Path to saved model file in a treelite-compatible format - (See https://treelite.readthedocs.io/en/latest/treelite-api.html + Path to saved model file in a treelite-compatible format + (See https://treelite.readthedocs.io/en/latest/treelite-api.html for more information) output_class: boolean (default=False) - For a Classification model output_class must be True. - For a Regression model output_class must be False. + For a Classification model `output_class` must be True. + For a Regression model `output_class` must be False. threshold : float (default=0.5) - Cutoff value above which a prediction is set to 1.0 - Only used if the model is classification and output_class is True + Cutoff value above which a prediction is set to 1.0. + Only used if the model is classification and `output_class` is True. algo : string (default='auto') - Which inference algorithm to use. - See documentation in FIL.load_from_treelite_model + Which inference algorithm to use. + See documentation in `FIL.load_from_treelite_model` storage_type : string (default='auto') In-memory storage format to be used for the FIL model.
- See documentation in FIL.load_from_treelite_model + See documentation in `FIL.load_from_treelite_model` model_type : string (default="xgboost") Format of the saved treelite model to be loaded. It can be 'xgboost', 'lightgbm'. Returns ---------- - fil_model : + fil_model A Forest Inference model which can be used to perform inferencing on the model read from the file. + """ cuml_fm = ForestInference(handle=handle) tl_model = TreeliteModel.from_filename(filename, model_type=model_type) @@ -669,21 +674,21 @@ class ForestInference(Base): (See https://treelite.readthedocs.io/en/latest/treelite-api.html for more information) output_class: boolean (default=False) - For a Classification model output_class must be True. - For a Regression model output_class must be False. + For a Classification model `output_class` must be True. + For a Regression model `output_class` must be False. threshold : float (default=0.5) - Cutoff value above which a prediction is set to 1.0 - Only used if the model is classification and output_class is True + Cutoff value above which a prediction is set to 1.0. + Only used if the model is classification and `output_class` is True. algo : string (default='auto') - Which inference algorithm to use. - See documentation in FIL.load_from_treelite_model + Which inference algorithm to use. + See documentation in `FIL.load_from_treelite_model` storage_type : string (default='auto') In-memory storage format to be used for the FIL model. - See documentation in FIL.load_from_treelite_model + See documentation in `FIL.load_from_treelite_model` Returns ---------- - fil_model : + fil_model A Forest Inference model which can be used to perform inferencing on the random forest model. """ diff --git a/python/cuml/linear_model/elastic_net.pyx b/python/cuml/linear_model/elastic_net.pyx index 66edadd6ee..82fd5a4443 100644 --- a/python/cuml/linear_model/elastic_net.pyx +++ b/python/cuml/linear_model/elastic_net.pyx @@ -36,7 +36,7 @@ class ElasticNet(Base, RegressorMixin): descent to fit a linear model. Examples - --------- + -------- .. code-block:: python diff --git a/python/cuml/linear_model/lasso.pyx b/python/cuml/linear_model/lasso.pyx index 4edcb773f6..392f11737e 100644 --- a/python/cuml/linear_model/lasso.pyx +++ b/python/cuml/linear_model/lasso.pyx @@ -37,7 +37,7 @@ class Lasso(Base, RegressorMixin): a linear model. Examples - --------- + -------- .. code-block:: python diff --git a/python/cuml/linear_model/linear_regression.pyx b/python/cuml/linear_model/linear_regression.pyx index 93b647711e..d4ceba39dc 100644 --- a/python/cuml/linear_model/linear_regression.pyx +++ b/python/cuml/linear_model/linear_regression.pyx @@ -86,7 +86,7 @@ class LinearRegression(Base, RegressorMixin): stable, but Eig (default) is much faster. Examples - --------- + -------- .. code-block:: python diff --git a/python/cuml/linear_model/logistic_regression.pyx b/python/cuml/linear_model/logistic_regression.pyx index d2d35d15de..328f90dd47 100644 --- a/python/cuml/linear_model/logistic_regression.pyx +++ b/python/cuml/linear_model/logistic_regression.pyx @@ -59,7 +59,7 @@ class LogisticRegression(Base, ClassifierMixin): Note that, just like in Scikit-learn, the bias will not be regularized. Examples - --------- + -------- ..
code-block:: python import cudf diff --git a/python/cuml/linear_model/mbsgd_classifier.pyx b/python/cuml/linear_model/mbsgd_classifier.pyx index 8e06563682..88fc0a8ea4 100644 --- a/python/cuml/linear_model/mbsgd_classifier.pyx +++ b/python/cuml/linear_model/mbsgd_classifier.pyx @@ -28,7 +28,7 @@ class MBSGDClassifier(Base, ClassifierMixin): fitted by minimizing a regularized empirical loss with mini-batch SGD. Examples - --------- + -------- .. code-block:: python import numpy as np diff --git a/python/cuml/linear_model/mbsgd_regressor.pyx b/python/cuml/linear_model/mbsgd_regressor.pyx index 18f72c93f8..6658ed4382 100644 --- a/python/cuml/linear_model/mbsgd_regressor.pyx +++ b/python/cuml/linear_model/mbsgd_regressor.pyx @@ -28,7 +28,7 @@ class MBSGDRegressor(Base, RegressorMixin): regularized empirical loss with mini-batch SGD. Examples - --------- + -------- .. code-block:: python import numpy as np diff --git a/python/cuml/linear_model/ridge.pyx b/python/cuml/linear_model/ridge.pyx index 4037f8bc93..1f6de4b6ef 100644 --- a/python/cuml/linear_model/ridge.pyx +++ b/python/cuml/linear_model/ridge.pyx @@ -98,7 +98,7 @@ class Ridge(Base, RegressorMixin): Coordinate Descent and can be faster when data is large. Examples - --------- + -------- .. code-block:: python diff --git a/python/cuml/manifold/umap.pyx b/python/cuml/manifold/umap.pyx index a91e111c51..308035b00f 100644 --- a/python/cuml/manifold/umap.pyx +++ b/python/cuml/manifold/umap.pyx @@ -128,7 +128,7 @@ cdef extern from "cuml/manifold/umap.hpp" namespace "ML": class UMAP(Base): - """Uniform Manifold Approximation and Projection + r"""Uniform Manifold Approximation and Projection Finds a low dimensional embedding of the data that approximates an underlying manifold. @@ -154,8 +154,9 @@ class UMAP(Base): The initial learning rate for the embedding optimization. init: string (optional, default 'spectral') How to initialize the low dimensional embedding. Options are: - * 'spectral': use a spectral embedding of the fuzzy 1-skeleton - * 'random': assign initial embedding positions at random. + * 'spectral': use a spectral embedding of the fuzzy 1-skeleton + * 'random': assign initial embedding positions at random. + min_dist: float (optional, default 0.1) The effective minimum distance between embedded points. Smaller values will result in a more clustered/clumped embedding where nearby points @@ -202,15 +203,16 @@ class UMAP(Base): More specific parameters controlling the embedding. If None these values are set automatically as determined by ``min_dist`` and ``spread``. - hash_input: UMAP can hash the training input so that exact embeddings - are returned when transform is called on the same data upon - which the model was trained. This enables consistent - behavior between calling model.fit_transform(X) and - calling model.fit(X).transform(X). Not that the CPU-based - UMAP reference implementation does this by default. This - feature is made optional in the GPU version due to the - significant overhead in copying memory to the host for - computing the hash. (default = False) + hash_input: bool, optional (default = False) + UMAP can hash the training input so that exact embeddings + are returned when transform is called on the same data upon + which the model was trained. This enables consistent + behavior between calling ``model.fit_transform(X)`` and + calling ``model.fit(X).transform(X)``. Note that the CPU-based + UMAP reference implementation does this by default.
This + feature is made optional in the GPU version due to the + significant overhead in copying memory to the host for + computing the hash. random_state : int, RandomState instance or None, optional (default=None) random_state is the seed used by the random number generator during embedding initialization and during sampling used by the optimizer. @@ -228,19 +230,24 @@ class UMAP(Base): The optimization step will be processed with at most optim_batch_size edges at once preventing inconsistencies. A lower batch size will yield more consistently repeatable embeddings at the cost of speed. - callback: An instance of GraphBasedDimRedCallback class to intercept - the internal state of embeddings while they are being trained. - Example of callback usage: - from cuml.internals import GraphBasedDimRedCallback - class CustomCallback(GraphBasedDimRedCallback): - def on_preprocess_end(self, embeddings): - print(embeddings.copy_to_host()) - - def on_epoch_end(self, embeddings): - print(embeddings.copy_to_host()) - - def on_train_end(self, embeddings): - print(embeddings.copy_to_host()) + callback: An instance of GraphBasedDimRedCallback class + Used to intercept the internal state of embeddings while they are being trained. + Example of callback usage: + + .. code-block:: python + + from cuml.internals import GraphBasedDimRedCallback + + class CustomCallback(GraphBasedDimRedCallback): + def on_preprocess_end(self, embeddings): + print(embeddings.copy_to_host()) + + def on_epoch_end(self, embeddings): + print(embeddings.copy_to_host()) + + def on_train_end(self, embeddings): + print(embeddings.copy_to_host()) + verbose : int or boolean (default = False) Controls verbosity of logging. @@ -249,11 +256,11 @@ class UMAP(Base): This module is heavily based on Leland McInnes' reference UMAP package. However, there are a number of differences and features that are not yet implemented in cuml.umap: - * Using a non-Euclidean distance metric (support for a fixed set - of non-Euclidean metrics is planned for an upcoming release). - * Using a pre-computed pairwise distance matrix (under consideration - for future releases) - * Manual initialization of initial embedding positions + * Using a non-Euclidean distance metric (support for a fixed set + of non-Euclidean metrics is planned for an upcoming release). + * Using a pre-computed pairwise distance matrix (under consideration + for future releases) + * Manual initialization of initial embedding positions In addition to these missing features, you should expect to see the final embeddings differing between cuml.umap and the reference @@ -261,15 +268,15 @@ class UMAP(Base): algorithm for large data sizes while cuml.umap always uses exact kNN. - Known issue: If a UMAP model has not yet been fit, it cannot be pickled. + **Known issue:** If a UMAP model has not yet been fit, it cannot be pickled. However, after fitting, a UMAP mode. References ---------- - * Leland McInnes, John Healy, James Melville - UMAP: Uniform Manifold Approximation and Projection for Dimension - Reduction - https://arxiv.org/abs/1802.03426 + .. 
[1] `Leland McInnes, John Healy, James Melville + UMAP: Uniform Manifold Approximation and Projection for Dimension + Reduction + <https://arxiv.org/abs/1802.03426>`_ """ diff --git a/python/cuml/metrics/_classification.py b/python/cuml/metrics/_classification.py index b95b070550..ad24926590 100644 --- a/python/cuml/metrics/_classification.py +++ b/python/cuml/metrics/_classification.py @@ -46,6 +46,7 @@ def log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None): Returns ------- loss : float + Examples -------- >>> from cuml.metrics import log_loss >>> import numpy as np >>> log_loss(np.array([1, 0, 0, 1]), ... np.array([[.1, .9], [.9, .1], [.8, .2], [.35, .65]])) 0.21616... + References ---------- C.M. Bishop (2006). Pattern Recognition and Machine Learning. Springer, p. 209. + Notes ----- The logarithm used is the natural logarithm (base-e). + """ y_true, n_rows, n_cols, ytype = \ input_to_cuml_array(y_true, check_dtype=[np.int32, np.int64, diff --git a/python/cuml/metrics/_ranking.py b/python/cuml/metrics/_ranking.py index 4ef0f38bf9..f50d8fc7c0 100644 --- a/python/cuml/metrics/_ranking.py +++ b/python/cuml/metrics/_ranking.py @@ -26,20 +26,20 @@ def precision_recall_curve(y_true, probs_pred): """ Compute precision-recall pairs for different probability thresholds - Note: this implementation is restricted to the binary classification task. - The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of - true positives and ``fp`` the number of false positives. The precision is - intuitively the ability of the classifier not to label as positive a sample - that is negative. + .. note:: this implementation is restricted to the binary classification task. + The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of + true positives and ``fp`` the number of false positives. The precision is + intuitively the ability of the classifier not to label as positive a sample + that is negative. - The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of - true positives and ``fn`` the number of false negatives. The recall is - intuitively the ability of the classifier to find all the positive samples. - The last precision and recall values are 1. and 0. respectively and do not - have a corresponding threshold. This ensures that the graph starts on the - y axis. + The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of + true positives and ``fn`` the number of false negatives. The recall is + intuitively the ability of the classifier to find all the positive samples. + The last precision and recall values are 1. and 0. respectively and do not + have a corresponding threshold. This ensures that the graph starts on the + y axis. - Read more in the :ref:`User Guide `. + Read more in the :ref:`User Guide `. Parameters ---------- @@ -120,7 +120,7 @@ def roc_auc_score(y_true, y_score): Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores. - Note: this implementation can only be used with binary classification. + .. note:: this implementation can only be used with binary classification. Parameters ---------- @@ -140,18 +140,13 @@ def roc_auc_score(y_true, y_score): Examples -------- - .. code-block:: python - - import numpy as np - from cuml.metrics import roc_auc_score - y_true = np.array([0, 0, 1, 1]) - y_scores = np.array([0.1, 0.4, 0.35, 0.8]) - print(roc_auc_score(y_true, y_scores)) - - Output: - ..
code-block:: python + >>> import numpy as np + >>> from cuml.metrics import roc_auc_score + >>> y_true = np.array([0, 0, 1, 1]) + >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) + >>> print(roc_auc_score(y_true, y_scores)) + 0.75 - 0.75 """ y_true, n_rows, n_cols, ytype = \ input_to_cuml_array(y_true, check_dtype=[np.int32, np.int64, diff --git a/python/cuml/metrics/pairwise_distances.pyx b/python/cuml/metrics/pairwise_distances.pyx index 3b77d12757..7c65171b79 100644 --- a/python/cuml/metrics/pairwise_distances.pyx +++ b/python/cuml/metrics/pairwise_distances.pyx @@ -160,7 +160,7 @@ def pairwise_distances(X, Y=None, metric="euclidean", handle=None, array from `X` and the jth array from `Y`. Examples - --------- + -------- >>> import cupy as cp >>> from cuml.metrics import pairwise_distances >>> diff --git a/python/cuml/neighbors/kneighbors_classifier.pyx b/python/cuml/neighbors/kneighbors_classifier.pyx index 33a324b152..189b905348 100644 --- a/python/cuml/neighbors/kneighbors_classifier.pyx +++ b/python/cuml/neighbors/kneighbors_classifier.pyx @@ -99,7 +99,7 @@ class KNeighborsClassifier(NearestNeighbors, ClassifierMixin): supported. Examples - --------- + -------- .. code-block:: python from cuml.neighbors import KNeighborsClassifier @@ -121,7 +121,6 @@ class KNeighborsClassifier(NearestNeighbors, ClassifierMixin): Output: - ------- .. code-block:: python diff --git a/python/cuml/neighbors/kneighbors_regressor.pyx b/python/cuml/neighbors/kneighbors_regressor.pyx index 9b0637bd51..074b6cb5a2 100644 --- a/python/cuml/neighbors/kneighbors_regressor.pyx +++ b/python/cuml/neighbors/kneighbors_regressor.pyx @@ -101,7 +101,7 @@ class KNeighborsRegressor(NearestNeighbors, RegressorMixin): supported. Examples - --------- + -------- .. code-block:: python from cuml.neighbors import KNeighborsRegressor diff --git a/python/cuml/neighbors/nearest_neighbors.pyx b/python/cuml/neighbors/nearest_neighbors.pyx index 151d490ad5..5ea0dbbb49 100644 --- a/python/cuml/neighbors/nearest_neighbors.pyx +++ b/python/cuml/neighbors/nearest_neighbors.pyx @@ -124,7 +124,7 @@ class NearestNeighbors(Base): metric_params : dict, optional (default = None) This is currently ignored. Examples - --------- + -------- .. code-block:: python import cudf diff --git a/python/cuml/preprocessing/encoders.py b/python/cuml/preprocessing/encoders.py index 0257d5d1ff..0b4db88b5b 100644 --- a/python/cuml/preprocessing/encoders.py +++ b/python/cuml/preprocessing/encoders.py @@ -37,8 +37,9 @@ class OneHotEncoder(Base): By default, the encoder derives the categories based on the unique values in each feature. Alternatively, you can also specify the `categories` manually. - Note: a one-hot encoding of y labels should use a LabelBinarizer - instead. + + .. note:: a one-hot encoding of y labels should use a LabelBinarizer + instead. 
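As a quick illustration of the note above, a hedged sketch contrasting the two encoders (import paths and minor API details are assumptions; cuML's encoders broadly mirror their scikit-learn counterparts):

.. code-block:: python

    import cudf
    import cupy as cp
    from cuml.preprocessing import OneHotEncoder, LabelBinarizer

    # Feature columns: OneHotEncoder
    X = cudf.DataFrame({'color': ['red', 'blue', 'red']})
    X_ohe = OneHotEncoder(sparse=False).fit_transform(X)

    # y labels: LabelBinarizer, per the note above
    y = cp.array([0, 2, 1, 2])
    y_bin = LabelBinarizer().fit_transform(y)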
Parameters ---------- diff --git a/python/cuml/preprocessing/model_selection.py b/python/cuml/preprocessing/model_selection.py index 973e18d316..b171150183 100644 --- a/python/cuml/preprocessing/model_selection.py +++ b/python/cuml/preprocessing/model_selection.py @@ -68,14 +68,13 @@ def _stratify_split(X, y, n_train, n_test, x_numba, y_numba, random_state): class_counts = cp.bincount(y_indices) if n_train < n_classes: raise ValueError('The train_size = %d should be greater or ' - 'equal to the number of classes = %d' % - (n_train, n_classes)) + 'equal to the number of classes = %d' % (n_train, + n_classes)) if n_test < n_classes: raise ValueError('The test_size = %d should be greater or ' - 'equal to the number of classes = %d' % - (n_test, n_classes)) - class_indices = cp.array_split(cp.argsort(y_indices), - n_classes) + 'equal to the number of classes = %d' % (n_test, + n_classes)) + class_indices = cp.array_split(cp.argsort(y_indices), n_classes) X_train = None @@ -175,17 +174,20 @@ def _approximate_mode(class_counts, n_draws, rng): return floored.astype(cp.int) -def train_test_split( - X, - y=None, - test_size: Union[float, int] = None, - train_size: Union[float, int] = None, - shuffle: bool = True, - random_state: Union[int, cp.random.RandomState, - np.random.RandomState] = None, - seed: Union[int, cp.random.RandomState, np.random.RandomState] = None, - stratify=None -): +def train_test_split(X, + y=None, + test_size: Union[float, + int] = None, + train_size: Union[float, + int] = None, + shuffle: bool = True, + random_state: Union[int, + cp.random.RandomState, + np.random.RandomState] = None, + seed: Union[int, + cp.random.RandomState, + np.random.RandomState] = None, + stratify=None): """ Partitions device data into four collated objects, mimicking Scikit-learn's `train_test_split` @@ -215,6 +217,7 @@ def train_test_split( Examples -------- + .. code-block:: python import cudf @@ -253,11 +256,13 @@ def train_test_split( Returns ------- + X_train, X_test, y_train, y_test : cudf.DataFrame or array-like objects Partitioned dataframes if X and y were cuDF objects. If `y` was provided as a column name, the column was dropped from the `X`s Partitioned numba device arrays if X and y were Numba device arrays. Partitioned CuPy arrays for any other input. 
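Since any non-None ``stratify`` value routes through ``_stratify_split`` (which, as the hunks above show, stratifies on ``y`` itself), a short hedged usage sketch (array sizes and the ``stratify=True`` convention are assumptions based on the code shown here):

.. code-block:: python

    import cupy as cp
    from cuml.preprocessing.model_selection import train_test_split

    X = cp.random.rand(100, 4)
    y = cp.concatenate([cp.zeros(50), cp.ones(50)])
    # Any non-None stratify value takes the _stratify_split path,
    # preserving the class proportions of y in both splits.
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2,
                                              stratify=True)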
+ """ if isinstance(y, str): # Use the column with name `str` as y @@ -284,10 +289,10 @@ def train_test_split( a cuda_array_interface compliant array.") if X.shape[0] != y.shape[0]: - raise ValueError( - "X and y must have the same first dimension" - "(found {} and {})".format(X.shape[0], y.shape[0]) - ) + raise ValueError("X and y must have the same first dimension " + "(found {} and {})".format( + X.shape[0], + y.shape[0])) else: if not hasattr(X, "__cuda_array_interface__") and not \ isinstance(X, cudf.DataFrame): @@ -296,31 +301,25 @@ def train_test_split( if isinstance(train_size, float): if not 0 <= train_size <= 1: - raise ValueError( - "proportion train_size should be between" - "0 and 1 (found {})".format(train_size) - ) + raise ValueError("proportion train_size should be between " + "0 and 1 (found {})".format(train_size)) if isinstance(train_size, int): if not 0 <= train_size <= X.shape[0]: raise ValueError( "Number of instances train_size should be between 0 and the" - "first dimension of X (found {})".format(train_size) - ) + " first dimension of X (found {})".format(train_size)) if isinstance(test_size, float): if not 0 <= test_size <= 1: - raise ValueError( - "proportion test_size should be between" - "0 and 1 (found {})".format(train_size) - ) + raise ValueError("proportion test_size should be between " + "0 and 1 (found {})".format(test_size)) if isinstance(test_size, int): if not 0 <= test_size <= X.shape[0]: raise ValueError( "Number of instances test_size should be between 0 and the" - "first dimension of X (found {})".format(test_size) - ) + " first dimension of X (found {})".format(test_size)) x_numba = cuda.devicearray.is_cuda_ndarray(X) y_numba = cuda.devicearray.is_cuda_ndarray(y) @@ -387,8 +386,13 @@ def train_test_split( y = cp.asarray(y)[idxs] if stratify is not None: - split_return = _stratify_split(X, y, train_size, test_size, - x_numba, y_numba, random_state) + split_return = _stratify_split(X, + y, + train_size, + test_size, + x_numba, + y_numba, + random_state) return split_return # If not stratified, perform train_test_split splicing diff --git a/python/cuml/random_projection/random_projection.pyx b/python/cuml/random_projection/random_projection.pyx index db486d4bf7..08613a8e7c 100644 --- a/python/cuml/random_projection/random_projection.pyx +++ b/python/cuml/random_projection/random_projection.pyx @@ -293,10 +293,11 @@ class GaussianRandomProjection(Base, BaseRandomProjection): The components of the random matrix are drawn from N(0, 1 / n_components). - Example - --------- + Examples + -------- .. code-block:: python + from cuml.random_projection import GaussianRandomProjection from sklearn.datasets.samples_generator import make_blobs from sklearn.svm import SVC @@ -324,6 +325,7 @@ class GaussianRandomProjection(Base, BaseRandomProjection): Output: .. code-block:: python + Score: 1.0 Parameters @@ -389,16 +391,17 @@ class SparseRandomProjection(Base, BaseRandomProjection): (e.g. Gaussian) that guarantees similar embedding quality while being much more memory efficient and allowing faster computation of the projected data (with sparse enough matrices).
- If we note 's = 1 / density' the components of the random matrix are + If we note ``s = 1 / density`` the components of the random matrix are drawn from: - - -sqrt(s) / sqrt(n_components) with probability 1 / 2s - - 0 with probability 1 - 1 / s - - +sqrt(s) / sqrt(n_components) with probability 1 / 2s + - ``-sqrt(s) / sqrt(n_components)`` - with probability ``1 / 2s`` + - ``0`` - with probability ``1 - 1 / s`` + - ``+sqrt(s) / sqrt(n_components)`` - with probability ``1 / 2s`` - Example - --------- + Examples + -------- .. code-block:: python + from cuml.random_projection import SparseRandomProjection from sklearn.datasets.samples_generator import make_blobs from sklearn.svm import SVC @@ -426,11 +429,11 @@ class SparseRandomProjection(Base, BaseRandomProjection): Output: .. code-block:: python + Score: 1.0 Parameters ---------- - handle : cuml.Handle If it is None, a new one is created just for this class @@ -439,13 +442,11 @@ class SparseRandomProjection(Base, BaseRandomProjection): the parameter is deducted thanks to Johnson–Lindenstrauss lemma. The automatic deduction makes use of the number of samples and the eps parameter. - The Johnson–Lindenstrauss lemma can produce a very conservative n_components parameter as it makes no assumption on dataset structure. density : float in range (0, 1] (default = 'auto') Ratio of non-zero component in the random projection matrix. - If density = 'auto', the value is set to the minimum density as recommended by Ping Li et al.: 1 / sqrt(n_features). @@ -461,14 +462,14 @@ class SparseRandomProjection(Base, BaseRandomProjection): Attributes ---------- - gaussian_method : boolean - To be passed to base class in order to determine - random matrix generation method + gaussian_method : boolean + To be passed to base class in order to determine + random matrix generation method Notes - ------ - Inspired by Scikit-learn's implementation : - https://scikit-learn.org/stable/modules/random_projection.html + ----- + Inspired by Scikit-learn's `implementation + <https://scikit-learn.org/stable/modules/random_projection.html>`_ """ diff --git a/python/cuml/solvers/qn.pyx b/python/cuml/solvers/qn.pyx index a1208a5177..82cb4cd418 100644 --- a/python/cuml/solvers/qn.pyx +++ b/python/cuml/solvers/qn.pyx @@ -137,7 +137,7 @@ class QN(Base): NumPy arrays or in device (as Numba or __cuda_array_interface__ compliant). Examples - --------- + -------- .. code-block:: python import cudf diff --git a/python/cuml/solvers/sgd.pyx b/python/cuml/solvers/sgd.pyx index dd031fd2a5..47e2fe02ad 100644 --- a/python/cuml/solvers/sgd.pyx +++ b/python/cuml/solvers/sgd.pyx @@ -127,7 +127,7 @@ class SGD(Base): ridge regression and SVM models. Examples - --------- + -------- ..
code-block:: python @@ -164,35 +164,35 @@ class SGD(Base): Parameters ----------- loss : 'hinge', 'log', 'squared_loss' (default = 'squared_loss') - 'hinge' uses linear SVM - 'log' uses logistic regression - 'squared_loss' uses linear regression + 'hinge' uses linear SVM + 'log' uses logistic regression + 'squared_loss' uses linear regression penalty: 'none', 'l1', 'l2', 'elasticnet' (default = 'none') - 'none' does not perform any regularization - 'l1' performs L1 norm (Lasso) which minimizes the sum of the abs value - of coefficients - 'l2' performs L2 norm (Ridge) which minimizes the sum of the square of - the coefficients - 'elasticnet' performs Elastic Net regularization which is a weighted - average of L1 and L2 norms + 'none' does not perform any regularization + 'l1' performs L1 norm (Lasso) which minimizes the sum of the abs value + of coefficients + 'l2' performs L2 norm (Ridge) which minimizes the sum of the square of + the coefficients + 'elasticnet' performs Elastic Net regularization which is a weighted + average of L1 and L2 norms alpha: float (default = 0.0001) The constant value which decides the degree of regularization fit_intercept : boolean (default = True) - If True, the model tries to correct for the global mean of y. - If False, the model expects that you have centered the data. + If True, the model tries to correct for the global mean of y. + If False, the model expects that you have centered the data. epochs : int (default = 1000) The number of times the model should iterate through the entire dataset during training (default = 1000) tol : float (default = 1e-3) - The training process will stop if current_loss > previous_loss - tol + The training process will stop if current_loss > previous_loss - tol shuffle : boolean (default = True) - True, shuffles the training data after each epoch - False, does not shuffle the training data after each epoch + True, shuffles the training data after each epoch + False, does not shuffle the training data after each epoch eta0 : float (default = 0.001) Initial learning rate power_t : float (default = 0.5) The exponent used for calculating the invscaling learning rate - learning_rate : 'optimal', 'constant', 'invscaling', + learning_rate : 'optimal', 'constant', 'invscaling', \ 'adaptive' (default = 'constant') optimal option supported in the next version constant keeps the learning rate constant diff --git a/python/cuml/svm/svc.pyx b/python/cuml/svm/svc.pyx index aa1e0409d3..f9fcaaf0cc 100644 --- a/python/cuml/svm/svc.pyx +++ b/python/cuml/svm/svc.pyx @@ -130,7 +130,7 @@ class SVC(SVMBase, ClassifierMixin): - predict_proba is not yet supported Examples - --------- + -------- .. code-block:: python import numpy as np @@ -278,7 +278,7 @@ class SVC(SVMBase, ClassifierMixin): y_m: device array of floats or doubles, shape = (n_samples, 1) Array of target labels already copied to the device. - Returns: + Returns -------- sample_weight: device array shape = (n_samples, 1) or None """ diff --git a/python/cuml/svm/svr.pyx b/python/cuml/svm/svr.pyx index 40d1b89654..29628b8eda 100644 --- a/python/cuml/svm/svr.pyx +++ b/python/cuml/svm/svr.pyx @@ -106,24 +106,6 @@ class SVR(SVMBase, RegressorMixin): Construct an SVR regressor for training and predictions. - Examples - --------- - ..
code-block:: python - - import numpy as np - from cuml.svm import SVR - X = np.array([[1], [2], [3], [4], [5]], dtype=np.float32) - y = np.array([1.1, 4, 5, 3.9, 1.], dtype = np.float32) - reg = SVR(kernel='rbf', gamma='scale', C=10, epsilon=0.1) - reg.fit(X, y) - print("Predicted values:", reg.predict(X)) - - Output: - - .. code-block:: none - - Predicted values: [1.200474 3.8999617 5.100488 3.7995374 1.0995375] - Parameters ---------- handle : cuml.Handle @@ -138,8 +120,9 @@ class SVR(SVMBase, RegressorMixin): gamma : float or string (default = 'scale') Coefficient for rbf, poly, and sigmoid kernels. You can specify the numeric value, or use one of the following options: - - 'auto': gamma will be set to 1 / n_features - - 'scale': gamma will be se to 1 / (n_features * X.var()) + - 'auto': gamma will be set to ``1 / n_features`` + - 'scale': gamma will be set to ``1 / (n_features * X.var())`` + coef0 : float (default = 0.0) Independent term in kernel function, only significant for poly and sigmoid @@ -186,25 +169,46 @@ class SVR(SVMBase, RegressorMixin): coef_ : float, shape [1, n_cols] Only available for linear kernels. It is the normal of the hyperplane. - coef_ = sum_k=1..n_support dual_coef_[k] * support_vectors[k,:] - + ``coef_ = sum_k=1..n_support dual_coef_[k] * support_vectors[k,:]`` Notes ----- + For additional docs, see `Scikit-learn's SVR + <https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html>`_. The solver uses the SMO method to fit the regressor. We use the Optimized - Hierarchical Decomposition [1] variant of the SMO algorithm, similar to [2] + Hierarchical Decomposition [1]_ variant of the SMO algorithm, similar to [2]_ References ---------- - [1] J. Vanek et al. A GPU-Architecture Optimized Hierarchical Decomposition - Algorithm for Support VectorMachine Training, IEEE Transactions on - Parallel and Distributed Systems, vol 28, no 12, 3330, (2017) - [2] Z. Wen et al. ThunderSVM: A Fast SVM Library on GPUs and CPUs, Journal - * of Machine Learning Research, 19, 1-5 (2018) - https://github.com/Xtra-Computing/thundersvm + + .. [1] J. Vanek et al. A GPU-Architecture Optimized Hierarchical Decomposition + Algorithm for Support Vector Machine Training, IEEE Transactions on + Parallel and Distributed Systems, vol 28, no 12, 3330, (2017) + + .. [2] `Z. Wen et al. ThunderSVM: A Fast SVM Library on GPUs and CPUs, Journal + of Machine Learning Research, 19, 1-5 (2018) + <https://github.com/Xtra-Computing/thundersvm>`_ + + Examples + -------- + + .. code-block:: python + + import numpy as np + from cuml.svm import SVR + X = np.array([[1], [2], [3], [4], [5]], dtype=np.float32) + y = np.array([1.1, 4, 5, 3.9, 1.], dtype = np.float32) + reg = SVR(kernel='rbf', gamma='scale', C=10, epsilon=0.1) + reg.fit(X, y) + print("Predicted values:", reg.predict(X)) + + Output: + + .. code-block:: python + + Predicted values: [1.200474 3.8999617 5.100488 3.7995374 1.0995375] """ def __init__(self, handle=None, C=1, kernel='rbf', degree=3, diff --git a/python/cuml/test/test_arima.py b/python/cuml/test/test_arima.py index 5d45a28228..4dcf32d2a5 100644 --- a/python/cuml/test/test_arima.py +++ b/python/cuml/test/test_arima.py @@ -261,9 +261,12 @@ def _statsmodels_to_cuml(ref_fits, cuml_model, order, seasonal_order, intercept, dtype): """Utility function to transfer the parameters from a statsmodels' SARIMAXResults object to a cuML ARIMA object. - Note: be cautious with the intercept, it is not always equivalent - in statsmodels and cuML models (it depends on the order). + + ..
note:: be cautious with the intercept, it is not always equivalent + in statsmodels and cuML models (it depends on the order). + """ + nb = cuml_model.batch_size N = cuml_model.complexity x = np.zeros(nb * N, dtype=np.float64) @@ -359,8 +362,11 @@ def test_loglikelihood(key, data, dtype): @pytest.mark.parametrize('key, data', test_data) @pytest.mark.parametrize('dtype', [np.float64]) def test_gradient(key, data, dtype): - """Test batched gradient implementation against scipy non-batched - gradient. Note: it doesn't test that the loglikelihood is correct! + """ + Test batched gradient implementation against scipy non-batched + gradient. + + .. note:: it doesn't test that the loglikelihood is correct! """ order, seasonal_order, intercept = extract_order(key) p, _, q = order diff --git a/python/cuml/test/test_pickle.py b/python/cuml/test/test_pickle.py index 7a4b0fe559..59d3bd8f59 100644 --- a/python/cuml/test/test_pickle.py +++ b/python/cuml/test/test_pickle.py @@ -417,7 +417,7 @@ def assert_model(pickled_model, X_test): def test_neighbors_pickle_nofit(tmpdir, datatype, data_info): result = {} """ - Note: This test digs down a bit far into the + .. note:: This test digs down a bit far into the internals of the implementation, but it's important that regressions do not occur from changes to the class. diff --git a/python/cuml/test/test_svm.py b/python/cuml/test/test_svm.py index 080a5ade7f..28cad10608 100644 --- a/python/cuml/test/test_svm.py +++ b/python/cuml/test/test_svm.py @@ -461,9 +461,11 @@ def get_memsize(svc): def test_svm_memleak(params, n_rows, n_iter, n_cols, use_handle, dataset='blobs'): """ - Test whether there is any memory leak. Note: small n_rows, and n_cols - values will result in small model size, that will not be measured by - get_memory_info. + Test whether there is any memory leak. + + .. note:: small `n_rows` and `n_cols` values will result in a small model + size that will not be measured by get_memory_info. + """ X_train, X_test, y_train, y_test = make_dataset(dataset, n_rows, n_cols) stream = cuml.cuda.Stream() diff --git a/python/cuml/tsa/arima.pyx b/python/cuml/tsa/arima.pyx index 40bf468a3d..1563058e6b 100644 --- a/python/cuml/tsa/arima.pyx +++ b/python/cuml/tsa/arima.pyx @@ -101,7 +101,7 @@ class ARIMA(Base): large batches of time series. Examples - --------- + -------- .. code-block:: python import numpy as np @@ -183,8 +183,8 @@ class ARIMA(Base): Performance ----------- - Let `r=max(p+s*P, q+s*Q+1)`. The device memory used for most operations - is `O(batch_size*n_obs + batch_size*r^2)`. The execution time is a linear + Let ``r=max(p+s*P, q+s*Q+1)``. The device memory used for most operations + is ``O(batch_size*n_obs + batch_size*r^2)``. The execution time is a linear function of `n_obs` and `batch_size` (if `batch_size` is large), but grows very fast with `r`. @@ -200,6 +200,7 @@ class ARIMA(Base): Additionally the following book is a useful reference: "Time Series Analysis by State Space Methods", J. Durbin, S.J. Koopman, 2nd Edition (2012).
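As a rough illustration of the Performance note above, a back-of-envelope sketch (the orders, batch size, and float64 element size are assumptions; the constant factors hidden by the O() bound are ignored):

.. code-block:: python

    # r = max(p + s*P, q + s*Q + 1) from the Performance section above
    p, q, P, Q, s = 1, 1, 1, 1, 12      # hypothetical (p, q)(P, Q)s orders
    batch_size, n_obs = 10000, 500

    r = max(p + s * P, q + s * Q + 1)   # here r = 14
    approx_elements = batch_size * n_obs + batch_size * r ** 2
    print(approx_elements * 8 / 1e9, "GB, order of magnitude only")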
+ """ def __init__(self, @@ -369,7 +370,7 @@ class ARIMA(Base): def get_params(self) -> Dict[str, np.ndarray]: """Get the parameters of the model - Returns: + Returns -------- params: Dict[str, np.ndarray] A dictionary of parameter names and associated arrays @@ -390,8 +391,8 @@ class ARIMA(Base): def set_params(self, params: Mapping[str, object]): """Set the parameters of the model - Parameters: - -------- + Parameters + ---------- params: Mapping[str, np.ndarray] A mapping (e.g dictionary) of parameter names and associated arrays The key names are in {"mu", "ar", "ma", "sar", "sma", "sigma2"} @@ -408,23 +409,24 @@ class ARIMA(Base): def predict(self, start=0, end=None): """Compute in-sample and/or out-of-sample prediction for each series - Parameters: - ----------- + Parameters + ---------- start: int Index where to start the predictions (0 <= start <= num_samples) end: Index where to end the predictions, excluded (end > start) - Returns: + Returns -------- y_p : array-like (device) Predictions. Shape = (end - start, batch_size) - Example: + Examples -------- .. code-block:: python + from cuml.tsa.arima import ARIMA - ... + model = ARIMA(ys, (1,1,1)) model.fit() y_pred = model.predict() @@ -505,19 +507,20 @@ class ARIMA(Base): def forecast(self, nsteps: int): """Forecast the given model `nsteps` into the future. - Parameters: + Parameters ---------- nsteps : int The number of steps to forecast beyond end of the given series - Returns: + Returns -------- y_fc : array-like Forecasts. Shape = (nsteps, batch_size) - Example: + Examples -------- .. code-block:: python + from cuml.tsa.arima import ARIMA ... model = ARIMA(ys, (1,1,1)) @@ -611,16 +614,21 @@ class ARIMA(Base): number of parameters of this type. Pass None for automatic estimation (recommended) opt_disp : int - Fit diagnostic level (for L-BFGS solver): + Fit diagnostic level (for L-BFGS solver) : * `-1` for no output (default) * `0<n<100` for output every `n` steps * `n>100` for more detailed output + h : float Finite-differencing step size. The gradient is computed using second-order differencing: + + .. code-block:: none + f(x+h) - f(x - h) g = ----------------- + O(h^2) 2 * h + maxiter : int Maximum number of iterations of L-BFGS-B method : str @@ -688,7 +696,7 @@ class ARIMA(Base): def _loglike(self, x, trans=True, method="ml", truncate=0): """Compute the batched log-likelihood for the given parameters. - Parameters: + Parameters ---------- x : array-like Packed parameter array, grouped by series @@ -702,8 +710,8 @@ class ARIMA(Base): When using CSS, start the sum of squares after a given number of observations - Returns: - -------- + Returns + ------- loglike : numpy.ndarray Batched log-likelihood. Shape: (batch_size,) """ @@ -738,7 +746,7 @@ class ARIMA(Base): """Compute the gradient (via finite differencing) of the batched log-likelihood. - Parameters: + Parameters ---------- x : array-like Packed parameter array, grouped by series. @@ -755,8 +763,8 @@ class ARIMA(Base): When using CSS, start the sum of squares after a given number of observations - Returns: - -------- + Returns + ------- grad : numpy.ndarray Batched log-likelihood gradient. Shape: (n_params * batch_size,) where n_params is the complexity of the model @@ -797,8 +805,8 @@ class ARIMA(Base): """Unpack linearized parameter vector `x` into the separate parameter arrays of the model - Parameters: - ----------- + Parameters + ---------- x : array-like Packed parameter array, grouped by series.
Shape: (n_params * batch_size,) @@ -833,8 +841,8 @@ class ARIMA(Base): def pack(self) -> np.ndarray: """Pack parameters of the model into a linearized vector `x` - Returns: - ----------- + Returns + ------- x : array-like Packed parameter array, grouped by series. Shape: (n_params * batch_size,) @@ -866,14 +874,14 @@ class ARIMA(Base): def _batched_transform(self, x, isInv=False): """Applies Jones transform or inverse transform to a parameter vector - Parameters: - ----------- + Parameters + ---------- x : array-like Packed parameter array, grouped by series. Shape: (n_params * batch_size,) - Returns: - ----------- + Returns + ------- Tx : array-like Packed transformed parameter array, grouped by series. Shape: (n_params * batch_size,) diff --git a/python/cuml/tsa/auto_arima.pyx b/python/cuml/tsa/auto_arima.pyx index 9a0ae5ecef..820d7db9f6 100644 --- a/python/cuml/tsa/auto_arima.pyx +++ b/python/cuml/tsa/auto_arima.pyx @@ -112,8 +112,8 @@ class AutoARIMA(Base): It provides an abstraction around the underlying ARIMA models to predict and forecast as if using a single model. - Example - ------- + Examples + -------- .. code-block:: python from cuml.tsa.auto_arima import AutoARIMA @@ -410,7 +410,7 @@ class AutoARIMA(Base): end: Index where to end the predictions, excluded (end > start) - Returns: + Returns -------- y_p : array-like (device) Predictions. Shape = (end - start, batch_size) @@ -433,7 +433,7 @@ class AutoARIMA(Base): nsteps : int The number of steps to forecast beyond end of the given series - Returns: + Returns -------- y_fc : array-like Forecasts. Shape = (nsteps, batch_size) @@ -467,8 +467,9 @@ def _parse_sequence(name, seq_in, min_accepted, max_accepted): def _divide_by_mask(original, mask, batch_id, handle=None): """Divide a given batch into two sub-batches according to a boolean mask - Note: in case the mask contains only False or only True, one sub-batch - will be the original batch (not a copy!) and the other None + + .. note:: in case the mask contains only False or only True, one sub-batch + will be the original batch (not a copy!) 
and the other None Parameters ---------- original : cumlArray (float32 or float64) @@ -481,7 +482,7 @@ handle : cuml.Handle If it is None, a new one is created just for this call - Returns: + Returns -------- out0 : cumlArray (float32 or float64) Sub-batch 0, or None if empty @@ -600,7 +601,7 @@ def _divide_by_min(original, metrics, batch_id, handle=None): handle : cuml.Handle If it is None, a new one is created just for this call - Returns: + Returns -------- sub_batches : List[cumlArray] (float32 or float64) List of arrays containing each sub-batch, or None if empty @@ -715,7 +716,7 @@ def _build_division_map(id_tracker, batch_size, handle=None): batch_size : int Size of the initial batch - Returns: + Returns -------- id_to_model : cumlArray (int) Associates each batch member with a model @@ -771,7 +772,7 @@ def _merge_series(data_in, id_to_sub, id_to_pos, batch_size, handle=None): batch_size : int Size of the initial batch - Returns: + Returns -------- data_out : cumlArray (float32 or float64) Merged batch diff --git a/python/cuml/tsa/holtwinters.pyx b/python/cuml/tsa/holtwinters.pyx index d626093ed7..af34be6bc1 100644 --- a/python/cuml/tsa/holtwinters.pyx +++ b/python/cuml/tsa/holtwinters.pyx @@ -76,19 +76,19 @@ class ExponentialSmoothing(Base): ----------------- This version of ExponentialSmoothing currently provides only a limited number of features when compared to the - statsmodels.holtwinters.ExponentialSmoothing model. Noticeably, it lacks: - - * predict : no support for in-sample prediction. - https://github.com/rapidsai/cuml/issues/875 - - * hessian : no support for returning Hessian matrix. - https://github.com/rapidsai/cuml/issues/880 - - * information : no support for returning Fisher matrix. - https://github.com/rapidsai/cuml/issues/880 - - * loglike : no support for returning Log-likelihood. - https://github.com/rapidsai/cuml/issues/880 + `statsmodels.holtwinters.ExponentialSmoothing` model. Notably, it lacks: + + * predict : no support for in-sample prediction. + https://github.com/rapidsai/cuml/issues/875 + + * hessian : no support for returning Hessian matrix. + https://github.com/rapidsai/cuml/issues/880 + + * information : no support for returning Fisher matrix. + https://github.com/rapidsai/cuml/issues/880 + + * loglike : no support for returning Log-likelihood. + https://github.com/rapidsai/cuml/issues/880 Additionally, be warned that there may exist floating point instability issues in this model. Small values in endog may lead to faulty results. @@ -101,39 +101,39 @@ class ExponentialSmoothing(Base): * Cannot pass trend component or damped trend component * this version can take additional parameters `eps`, - `start_periods`, `ts_num`, and `handle` + `start_periods`, `ts_num`, and `handle` * Score returns SSE rather than gradient logL - https://github.com/rapidsai/cuml/issues/876 + https://github.com/rapidsai/cuml/issues/876 * This version provides get_level(), get_trend(), get_season() Examples -------- ..
code-block:: python - from cuml import ExponentialSmoothing - import cudf - import numpy as np - data = cudf.Series([1, 2, 3, 4, 5, 6, - 7, 8, 9, 10, 11, 12, - 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, - 3, 4, 5, 6, 7, 8, 9, - 10, 11, 12, 13, 14], - dtype=np.float64) - cu_hw = ExponentialSmoothing(data, seasonal_periods=12) - cu_hw.fit() - cu_pred = cu_hw.forecast(4) - print('Forecasted points:', cu_pred) - Output - + from cuml import ExponentialSmoothing + import cudf + import numpy as np + data = cudf.Series([1, 2, 3, 4, 5, 6, + 7, 8, 9, 10, 11, 12, + 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, + 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14], + dtype=np.float64) + cu_hw = ExponentialSmoothing(data, seasonal_periods=12) + cu_hw.fit() + cu_pred = cu_hw.forecast(4) + print('Forecasted points:', cu_pred) + + Output: .. code-block:: python - Forecasted points : - 0 4.000143766093652 - 1 5.000000163513641 - 2 6.000000000174092 - 3 7.000000000000178 + Forecasted points : + 0 4.000143766093652 + 1 5.000000163513641 + 2 6.000000000174092 + 3 7.000000000000178 Parameters ---------- @@ -144,7 +144,8 @@ class ExponentialSmoothing(Base): Note: cuDF.DataFrame types assumes data is in columns, while all other datatypes assume data is in rows. The endogenous dataset to be operated on. - seasonal : 'additive', 'add', 'multiplicative', 'mul' (default = 'additive') # noqa + seasonal : 'additive', 'add', 'multiplicative', 'mul' \ + (default = 'additive') Whether the seasonal trend should be calculated additively or multiplicatively. seasonal_periods : int (default=2) @@ -435,8 +436,8 @@ class ExponentialSmoothing(Base): """ Returns the score of the model. - **Note: Currently returns the SSE, rather than the gradient of the - LogLikelihood. https://github.com/rapidsai/cuml/issues/876 + .. note:: Currently returns the SSE, rather than the gradient of the + LogLikelihood. https://github.com/rapidsai/cuml/issues/876 Parameters ---------- From 874b3de5a0fbf750fecb52dda22be55bc4df7ebf Mon Sep 17 00:00:00 2001 From: Michael Demoret Date: Thu, 6 Aug 2020 23:14:54 -0600 Subject: [PATCH 09/15] Style cleanup --- docs/source/conf.py | 2 +- python/cuml/common/array.py | 12 +++---- python/cuml/dask/datasets/regression.py | 4 +-- python/cuml/dask/manifold/umap.py | 3 +- .../feature_extraction/_tfidf_vectorizer.py | 6 ++-- .../cuml/feature_extraction/_vectorizers.py | 32 ++++++++++--------- python/cuml/metrics/_ranking.py | 29 +++++++++-------- python/cuml/preprocessing/encoders.py | 2 +- python/cuml/test/test_arima.py | 2 +- python/cuml/test/test_svm.py | 4 +-- 10 files changed, 50 insertions(+), 46 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 1db62bd9bd..615247e900 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -26,7 +26,7 @@ sys.path.insert(0, os.path.abspath('sphinxext')) sys.path.insert(0, os.path.abspath('../../python')) -from github_link import make_linkcode_resolve +from github_link import make_linkcode_resolve # noqa # -- General configuration ------------------------------------------------ diff --git a/python/cuml/common/array.py b/python/cuml/common/array.py index d5c22d6638..869d9fff0a 100644 --- a/python/cuml/common/array.py +++ b/python/cuml/common/array.py @@ -31,16 +31,16 @@ class CumlArray(Buffer): """ Array represents an abstracted array allocation. 
It can be instantiated by itself, creating an rmm.DeviceBuffer underneath, or can be instantiated by - ``__cuda_array_interface__`` or ``__array_interface__`` compliant arrays, in which - case it'll keep a reference to that data underneath. Also can be created - from a pointer, specifying the characteristics of the array, in that case - the owner of the data referred to by the pointer should be specified - explicitly. + ``__cuda_array_interface__`` or ``__array_interface__`` compliant arrays, + in which case it'll keep a reference to that data underneath. Also can be + created from a pointer, specifying the characteristics of the array, in + that case the owner of the data referred to by the pointer should be + specified explicitly. Parameters ---------- - data : rmm.DeviceBuffer, cudf.Buffer, array_like, int, bytes, bytearray or \ + data : rmm.DeviceBuffer, cudf.Buffer, array_like, int, bytes, bytearray or\ memoryview An array-like object or integer representing a device or host pointer to pre-allocated memory. diff --git a/python/cuml/dask/datasets/regression.py b/python/cuml/dask/datasets/regression.py index 4802b6b792..e2e439d691 100644 --- a/python/cuml/dask/datasets/regression.py +++ b/python/cuml/dask/datasets/regression.py @@ -279,7 +279,7 @@ def make_regression(n_samples=100, n_features=100, n_informative=10, use_full_low_rank=True): """ Generate a random regression problem. - + The input set can either be well conditioned (by default) or have a low rank-fat tail singular profile. @@ -362,7 +362,7 @@ def make_regression(n_samples=100, n_features=100, n_informative=10, 3. When `shuffle = True` and `order = F`, there are memory spikes to \ shuffle the `F` order arrays - .. note:: If out-of-memory errors are encountered in any of the above + .. note:: If out-of-memory errors are encountered in any of the above configurations, try increasing the `n_parts` parameter. """ diff --git a/python/cuml/dask/manifold/umap.py b/python/cuml/dask/manifold/umap.py index 0cb6d554f7..9e071283fc 100644 --- a/python/cuml/dask/manifold/umap.py +++ b/python/cuml/dask/manifold/umap.py @@ -65,7 +65,8 @@ class UMAP(BaseEstimator, Notes ----- - This module is heavily based on Leland McInnes' reference UMAP package [1]_. + This module is heavily based on Leland McInnes' reference UMAP package + [1]_. However, there are a number of differences and features that are not yet implemented in `cuml.umap`: * Using a non-Euclidean distance metric (support for a fixed set diff --git a/python/cuml/feature_extraction/_tfidf_vectorizer.py b/python/cuml/feature_extraction/_tfidf_vectorizer.py index c774ae11dc..db0b419073 100644 --- a/python/cuml/feature_extraction/_tfidf_vectorizer.py +++ b/python/cuml/feature_extraction/_tfidf_vectorizer.py @@ -94,9 +94,9 @@ class TfidfVectorizer(CountVectorizer): Typically the delimiting character between words is a good choice. norm : {'l1', 'l2'}, default='l2' Each output row will have unit norm, either: - * 'l2': Sum of squares of vector elements is 1. The cosine \ - similarity between two vectors is their dot product when l2 norm has \ - been applied. + * 'l2': Sum of squares of vector elements is 1. The cosine + similarity between two vectors is their dot product when l2 norm + has been applied. * 'l1': Sum of absolute values of vector elements is 1. 
use_idf : bool, default=True diff --git a/python/cuml/feature_extraction/_vectorizers.py b/python/cuml/feature_extraction/_vectorizers.py index 9fc2f01b12..e2ee8aa287 100644 --- a/python/cuml/feature_extraction/_vectorizers.py +++ b/python/cuml/feature_extraction/_vectorizers.py @@ -506,7 +506,7 @@ def fit(self, raw_documents): raw_documents : cudf.Series A Series of string documents - + Returns ------- self @@ -519,7 +519,8 @@ def fit_transform(self, raw_documents): """ Build the vocabulary and return document-term matrix. - Equivalent to ``self.fit(X).transform(X)`` but preprocess `X` only once. + Equivalent to ``self.fit(X).transform(X)`` but preprocess `X` only + once. Parameters ---------- @@ -653,22 +654,22 @@ class HashingVectorizer(_VectorizerMixin): This strategy has several advantages: - - it is very low memory scalable to large datasets as there is no need to\ - store a vocabulary dictionary in memory which is even more important \ - as GPU's that are often memory constrained - - it is fast to pickle and un-pickle as it holds no state besides the \ constructor parameters - - it can be used in a streaming (partial fit) or parallel pipeline as there \ - is no state computed during fit. + - it is very low memory scalable to large datasets as there is no need to + store a vocabulary dictionary in memory, which is even more important as + GPUs are often memory constrained + - it is fast to pickle and un-pickle as it holds no state besides the + constructor parameters + - it can be used in a streaming (partial fit) or parallel pipeline as + there is no state computed during fit. There are also a couple of cons (vs using a CountVectorizer with an in-memory vocabulary): - - there is no way to compute the inverse transform (from feature indices to \ - string feature names) which can be a problem when trying to introspect \ - which features are most important to a model. - - there can be collisions: distinct tokens can be mapped to the same \ - feature index. However in practice this is rarely an issue if n_features \ - is large enough (e.g. 2 ** 18 for text classification problems). - - no IDF weighting as this would render the transformer stateful. + - there is no way to compute the inverse transform (from feature indices + to string feature names) which can be a problem when trying to + introspect which features are most important to a model. + - there can be collisions: distinct tokens can be mapped to the same + feature index. However in practice this is rarely an issue if n_features + is large enough (e.g. 2 ** 18 for text classification problems). + - no IDF weighting as this would render the transformer stateful. @@ -716,8 +717,9 @@ class HashingVectorizer(_VectorizerMixin): dtype : type, optional Type of the matrix returned by fit_transform() or transform(). delimiter : str, whitespace by default - String used as a replacement for stop words if `stop_words` is not None. - Typically the delimiting character between words is a good choice. + String used as a replacement for stop words if `stop_words` is not + None. Typically the delimiting character between words is a good + choice. Examples -------- diff --git a/python/cuml/metrics/_ranking.py b/python/cuml/metrics/_ranking.py index f50d8fc7c0..908bf25e4b 100644 --- a/python/cuml/metrics/_ranking.py +++ b/python/cuml/metrics/_ranking.py @@ -26,20 +26,21 @@ def precision_recall_curve(y_true, probs_pred): """ Compute precision-recall pairs for different probability thresholds - .. note:: this implementation is restricted to the binary classification task.
- The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of - true positives and ``fp`` the number of false positives. The precision is - intuitively the ability of the classifier not to label as positive a sample - that is negative. - - The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of - true positives and ``fn`` the number of false negatives. The recall is - intuitively the ability of the classifier to find all the positive samples. - The last precision and recall values are 1. and 0. respectively and do not - have a corresponding threshold. This ensures that the graph starts on the - y axis. - - Read more in the :ref:`User Guide `. + .. note:: this implementation is restricted to the binary classification + task. The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the + number of true positives and ``fp`` the number of false positives. The + precision is intuitively the ability of the classifier not to label as + positive a sample that is negative. + + The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number + of true positives and ``fn`` the number of false negatives. The recall + is intuitively the ability of the classifier to find all the positive + samples. The last precision and recall values are 1. and 0. + respectively and do not have a corresponding threshold. This ensures + that the graph starts on the y axis. + + Read more in the + :ref:`User Guide `. Parameters ---------- diff --git a/python/cuml/preprocessing/encoders.py b/python/cuml/preprocessing/encoders.py index 0b4db88b5b..5d414c0e68 100644 --- a/python/cuml/preprocessing/encoders.py +++ b/python/cuml/preprocessing/encoders.py @@ -37,7 +37,7 @@ class OneHotEncoder(Base): By default, the encoder derives the categories based on the unique values in each feature. Alternatively, you can also specify the `categories` manually. - + .. note:: a one-hot encoding of y labels should use a LabelBinarizer instead. diff --git a/python/cuml/test/test_arima.py b/python/cuml/test/test_arima.py index 4dcf32d2a5..65b18e4340 100644 --- a/python/cuml/test/test_arima.py +++ b/python/cuml/test/test_arima.py @@ -365,7 +365,7 @@ def test_gradient(key, data, dtype): """ Test batched gradient implementation against scipy non-batched gradient. - + .. note:: it doesn't test that the loglikelihood is correct! """ order, seasonal_order, intercept = extract_order(key) diff --git a/python/cuml/test/test_svm.py b/python/cuml/test/test_svm.py index 28cad10608..14780f889c 100644 --- a/python/cuml/test/test_svm.py +++ b/python/cuml/test/test_svm.py @@ -461,8 +461,8 @@ def get_memsize(svc): def test_svm_memleak(params, n_rows, n_iter, n_cols, use_handle, dataset='blobs'): """ - Test whether there is any memory leak. - + Test whether there is any memory leak. + + .. note:: small `n_rows` and `n_cols` values will result in a small model size that will not be measured by get_memory_info.
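The leak check that note refers to can be sketched roughly as follows (``model``, ``X``, ``y``, ``n_iter`` and ``tolerance`` are assumed to be defined; the actual test uses a helper with more bookkeeping):

.. code-block:: python

    from numba import cuda

    free_before, _ = cuda.current_context().get_memory_info()
    for _ in range(n_iter):
        model.fit(X, y)
    free_after, _ = cuda.current_context().get_memory_info()
    # Growth beyond a small tolerance suggests a leak; tiny models can
    # fall below the allocator's granularity, hence the note above.
    assert free_before - free_after < tolerance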
From 8c17febdbaf4f1ebc5bfaabfb5569995399b6fde Mon Sep 17 00:00:00 2001 From: Michael Demoret Date: Thu, 6 Aug 2020 23:40:23 -0600 Subject: [PATCH 10/15] More style cleanup --- .../cuml/ensemble/randomforestregressor.pyx | 9 ++-- python/cuml/fil/fil.pyx | 46 +++++++++---------- python/cuml/manifold/umap.pyx | 8 ++-- python/cuml/svm/svr.pyx | 14 +++--- python/cuml/tsa/holtwinters.pyx | 6 +-- 5 files changed, 43 insertions(+), 40 deletions(-) diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index c86b32c821..61b6ab3e0f 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -110,11 +110,12 @@ class RandomForestRegressor(BaseRandomForestModel, RegressorMixin): """ Implements a Random Forest regressor model which fits multiple decision trees in an ensemble. - - .. note:: that the underlying algorithm for tree node splits differs from that - used in scikit-learn. By default, the cuML Random Forest uses a + + .. note:: The underlying algorithm for tree node splits differs from + that used in scikit-learn. By default, the cuML Random Forest uses a histogram-based algorithm to determine splits, rather than an exact - count. You can tune the size of the histograms with the n_bins parameter. + count. You can tune the size of the histograms with the n_bins + parameter. **Known Limitations**: This is an early release of the cuML Random Forest code. It contains a few known limitations: diff --git a/python/cuml/fil/fil.pyx b/python/cuml/fil/fil.pyx index 9fda25a9d1..507ebee5af 100644 --- a/python/cuml/fil/fil.pyx +++ b/python/cuml/fil/fil.pyx @@ -254,7 +254,7 @@ cdef class ForestInference_impl(): ------- Predicted results of type as defined by the output_type variable - + """ if (not self.output_class) and predict_proba: raise NotImplementedError("Predict_proba function is not available" @@ -520,24 +520,24 @@ class ForestInference(Base): For a Regression model output_class must be False. algo : string (default='auto') name of the algo from (from algo_t enum) : - - 'AUTO' or 'auto' - choose the algorithm automatically; \ - currently 'BATCH_TREE_REORG' is used for dense storage, \ - and 'NAIVE' for sparse storage + - 'AUTO' or 'auto' - choose the algorithm automatically; + currently 'BATCH_TREE_REORG' is used for dense storage, + and 'NAIVE' for sparse storage - 'NAIVE' or 'naive' - simple inference using shared memory - - 'TREE_REORG' or 'tree_reorg' - similar to naive but trees \ - rearranged to be more coalescing-friendly - - 'BATCH_TREE_REORG' or 'batch_tree_reorg' - similar to TREE_REORG \ - but predicting multiple rows per thread block + - 'TREE_REORG' or 'tree_reorg' - similar to naive but trees + rearranged to be more coalescing-friendly + - 'BATCH_TREE_REORG' or 'batch_tree_reorg' - similar to TREE_REORG + but predicting multiple rows per thread block threshold : float (default=0.5) Threshold is used to for classification. It is applied only if output_class == True, else it is ignored. 
storage_type : string or boolean (default='auto') In-memory storage format to be used for the FIL model: - - 'auto' - choose the storage type automatically \ - (currently DENSE is always used) + - 'auto' - choose the storage type automatically + (currently DENSE is always used) - False - create a dense forest - - True - create a sparse forest; \ - requires algo='NAIVE' or algo='AUTO' + - True - create a sparse forest; + requires algo='NAIVE' or algo='AUTO' Returns ---------- @@ -575,24 +575,24 @@ class ForestInference(Base): For a Regression model output_class must be False. algo : string (default='auto') name of the algo from (from algo_t enum): - 'AUTO' or 'auto' - choose the algorithm automatically; \ - currently 'BATCH_TREE_REORG' is used for dense storage, \ - and 'NAIVE' for sparse storage + - 'AUTO' or 'auto' - choose the algorithm automatically; + currently 'BATCH_TREE_REORG' is used for dense storage, + and 'NAIVE' for sparse storage - 'NAIVE' or 'naive' - simple inference using shared memory - - 'TREE_REORG' or 'tree_reorg' - similar to naive but trees \ - rearranged to be more coalescing-friendly - - 'BATCH_TREE_REORG' or 'batch_tree_reorg' - similar to TREE_REORG \ - but predicting multiple rows per thread block + - 'TREE_REORG' or 'tree_reorg' - similar to naive but trees + rearranged to be more coalescing-friendly + - 'BATCH_TREE_REORG' or 'batch_tree_reorg' - similar to TREE_REORG + but predicting multiple rows per thread block threshold : float (default=0.5) Threshold is used to for classification. It is applied only if ``output_class == True``, else it is ignored. storage_type : string or boolean (default='auto') In-memory storage format to be used for the FIL model: - - 'auto' - choose the storage type automatically \ - (currently DENSE is always used) + - 'auto' - choose the storage type automatically + (currently DENSE is always used) - False - create a dense forest - - True - create a sparse forest; \ - requires algo='NAIVE' or algo='AUTO' + - True - create a sparse forest; + requires algo='NAIVE' or algo='AUTO' Returns ---------- diff --git a/python/cuml/manifold/umap.pyx b/python/cuml/manifold/umap.pyx index 308035b00f..c1a5cd2b49 100644 --- a/python/cuml/manifold/umap.pyx +++ b/python/cuml/manifold/umap.pyx @@ -231,8 +231,8 @@ class UMAP(Base): edges at once preventing inconsistencies. A lower batch size will yield more consistently repeatable embeddings at the cost of speed. callback: An instance of GraphBasedDimRedCallback class - Used to intercept the internal state of embeddings while they are being trained. - Example of callback usage: + Used to intercept the internal state of embeddings while they are being + trained. Example of callback usage: .. code-block:: python @@ -268,8 +268,8 @@ class UMAP(Base): algorithm for large data sizes while cuml.umap always uses exact kNN. - **Known issue:** If a UMAP model has not yet been fit, it cannot be pickled. - However, after fitting, a UMAP mode. + **Known issue:** If a UMAP model has not yet been fit, it cannot be + pickled. However, after fitting, a UMAP model can be pickled. References ---------- diff --git a/python/cuml/svm/svr.pyx b/python/cuml/svm/svr.pyx index 29628b8eda..0c0adf17b1 100644 --- a/python/cuml/svm/svr.pyx +++ b/python/cuml/svm/svr.pyx @@ -178,17 +178,19 @@ class SVR(SVMBase, RegressorMixin): `_. The solver uses the SMO method to fit the regressor. 
We use the Optimized - Hierarchical Decomposition [1]_ variant of the SMO algorithm, similar to [2]_ + Hierarchical Decomposition [1]_ variant of the SMO algorithm, similar to + [2]_ References ---------- - .. [1] J. Vanek et al. A GPU-Architecture Optimized Hierarchical Decomposition - Algorithm for Support VectorMachine Training, IEEE Transactions on - Parallel and Distributed Systems, vol 28, no 12, 3330, (2017) + .. [1] J. Vanek et al. A GPU-Architecture Optimized Hierarchical + Decomposition Algorithm for Support Vector Machine Training, IEEE + Transactions on Parallel and Distributed Systems, vol 28, no 12, + 3330, (2017) - .. [2] `Z. Wen et al. ThunderSVM: A Fast SVM Library on GPUs and CPUs, Journal - of Machine Learning Research, 19, 1-5 (2018) + .. [2] `Z. Wen et al. ThunderSVM: A Fast SVM Library on GPUs and CPUs, + Journal of Machine Learning Research, 19, 1-5 (2018) `_ Examples diff --git a/python/cuml/tsa/holtwinters.pyx b/python/cuml/tsa/holtwinters.pyx index af34be6bc1..49fdf9c47c 100644 --- a/python/cuml/tsa/holtwinters.pyx +++ b/python/cuml/tsa/holtwinters.pyx @@ -80,13 +80,13 @@ class ExponentialSmoothing(Base): * predict : no support for in-sample prediction. * https://github.com/rapidsai/cuml/issues/875 - + * hessian : no support for returning Hessian matrix. * https://github.com/rapidsai/cuml/issues/880 - + * information : no support for returning Fisher matrix. * https://github.com/rapidsai/cuml/issues/880 - + * loglike : no support for returning Log-likelihood. * https://github.com/rapidsai/cuml/issues/880 From ba044271cbf2ddf43bd41eae112db35103219c4c Mon Sep 17 00:00:00 2001 From: Michael Demoret <42954918+mdemoret-nv@users.noreply.github.com> Date: Wed, 26 Aug 2020 16:17:25 -0600 Subject: [PATCH 11/15] Fixing doc warnings --- docs/source/api.rst | 2 +- python/cuml/preprocessing/TargetEncoder.py | 10 ++++----- python/cuml/tsa/auto_arima.pyx | 26 +++++++++++----------- 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/docs/source/api.rst b/docs/source/api.rst index 611d45ffd8..54cbc823ae 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -70,7 +70,7 @@ Feature and Label Encoding (Single-GPU) .. autoclass:: cuml.preprocessing.OneHotEncoder :members: - .. autoclass:: cuml.preprocessing.TargetEncoder + .. autoclass:: cuml.preprocessing.TargetEncoder.TargetEncoder :members: Feature and Label Encoding (Dask-based Multi-GPU) diff --git a/python/cuml/preprocessing/TargetEncoder.py b/python/cuml/preprocessing/TargetEncoder.py index 2a2934ad40..c28839792f 100644 --- a/python/cuml/preprocessing/TargetEncoder.py +++ b/python/cuml/preprocessing/TargetEncoder.py @@ -23,7 +23,7 @@ class TargetEncoder: """ - A cudf based implementation of target encoding [1], which converts + A cudf based implementation of target encoding [1]_, which converts one or mulitple categorical variables, 'Xs', with the average of corresponding values of the target variable, 'Y'. 
The input data is grouped by the columns `Xs` and the aggregated mean value of `Y` of @@ -84,11 +84,10 @@ class TargetEncoder: def __init__(self, n_folds=4, smooth=0, seed=42, split_method='interleaved', output_type='auto'): if smooth < 0 or smooth > 1: - raise ValueError( - 'smooth {} is not in range [0,1]'.format(smooth)) + raise ValueError('smooth {} is not in range [0,1]'.format(smooth)) if n_folds < 0 or not isinstance(n_folds, int): raise ValueError( - 'n_folds {} is not a postive integer'.format(n_folds)) + 'n_folds {} is not a positive integer'.format(n_folds)) if output_type not in {'cupy', 'numpy', 'auto'}: msg = ("output_type should be either 'cupy'" " or 'numpy' or 'auto', " @@ -96,8 +95,7 @@ def __init__(self, n_folds=4, smooth=0, seed=42, raise ValueError(msg) if not isinstance(seed, int): - raise ValueError( - 'seed {} is not an integer'.format(seed)) + raise ValueError('seed {} is not an integer'.format(seed)) if split_method not in {'random', 'continuous', 'interleaved'}: msg = ("split_method should be either 'random'" diff --git a/python/cuml/tsa/auto_arima.pyx b/python/cuml/tsa/auto_arima.pyx index 31ab99adad..b854a507eb 100644 --- a/python/cuml/tsa/auto_arima.pyx +++ b/python/cuml/tsa/auto_arima.pyx @@ -409,8 +409,8 @@ class AutoARIMA(Base): def predict(self, start=0, end=None, level=None): """Compute in-sample and/or out-of-sample prediction for each series - Parameters: - ----------- + Parameters + ---------- start: int Index where to start the predictions (0 <= start <= num_samples) end: @@ -420,7 +420,7 @@ class AutoARIMA(Base): the point forecasts. 0 < level < 1 Returns - -------- + ------- y_p : array-like (device) Predictions. Shape = (end - start, batch_size) lower: array-like (device) (optional) @@ -462,7 +462,7 @@ class AutoARIMA(Base): def forecast(self, nsteps: int, level=None): """Forecast `nsteps` into the future. - Parameters: + Parameters ---------- nsteps : int The number of steps to forecast beyond end of the given series @@ -471,7 +471,7 @@ class AutoARIMA(Base): the point forecasts. 0 < level < 1 Returns - -------- + ------- y_fc : array-like Forecasts. Shape = (nsteps, batch_size) lower: array-like (device) (optional) @@ -514,7 +514,7 @@ def _divide_by_mask(original, mask, batch_id, handle=None): .. note:: in case the mask contains only False or only True, one sub-batch will be the original batch (not a copy!) 
and the other None - Parameters: + Parameters ---------- original : cumlArray (float32 or float64) Original batch @@ -526,7 +526,7 @@ def _divide_by_mask(original, mask, batch_id, handle=None): If it is None, a new one is created just for this call Returns - -------- + ------- out0 : cumlArray (float32 or float64) Sub-batch 0, or None if empty batch0_id : cumlArray (int) @@ -633,7 +633,7 @@ def _divide_by_min(original, metrics, batch_id, handle=None): """Divide a given batch into multiple sub-batches according to the values of the given metrics, by selecting the minimum value for each member - Parameters: + Parameters ---------- original : cumlArray (float32 or float64) Original batch @@ -645,7 +645,7 @@ def _divide_by_min(original, metrics, batch_id, handle=None): If it is None, a new one is created just for this call Returns - -------- + ------- sub_batches : List[cumlArray] (float32 or float64) List of arrays containing each sub-batch, or None if empty sub_id : List[cumlArray] (int) @@ -752,7 +752,7 @@ def _build_division_map(id_tracker, batch_size, handle=None): """Build a map to associate each batch member with a model and index in the associated sub-batch - Parameters: + Parameters ---------- id_tracker : List[cumlArray] (int) List of the index arrays of each sub-batch @@ -760,7 +760,7 @@ def _build_division_map(id_tracker, batch_size, handle=None): Size of the initial batch Returns - -------- + ------- id_to_model : cumlArray (int) Associates each batch member with a model id_to_pos : cumlArray (int) @@ -804,7 +804,7 @@ def _merge_series(data_in, id_to_sub, id_to_pos, batch_size, handle=None): associate each id in the unique batch to a sub-batch and a position in this sub-batch. - Parameters: + Parameters ---------- data_in : List[cumlArray] (float32 or float64) List of sub-batches to merge @@ -816,7 +816,7 @@ def _merge_series(data_in, id_to_sub, id_to_pos, batch_size, handle=None): Size of the initial batch Returns - -------- + ------- data_out : cumlArray (float32 or float64) Merged batch """ From c3a381dde0da098a478daf75bb0117568ccbf6d9 Mon Sep 17 00:00:00 2001 From: Michael Demoret <42954918+mdemoret-nv@users.noreply.github.com> Date: Wed, 26 Aug 2020 16:22:29 -0600 Subject: [PATCH 12/15] Moving the CHANGELOG PR from 0.15 to 0.16 --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 77edd20c67..d3a9ffe330 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # cuML 0.16.0 (Date TBD) ## New Features +- PR #2541: Improve Documentation Examples and Source Linking ## Improvements - PR #2735: Update seed to random_state in random forest and associated tests @@ -45,7 +46,6 @@ - PR #2322: Sparse FIL forests with 8-byte nodes - PR #2675: Update conda recipes to support CUDA 11 - PR #2638: Improve cython build with custom `build_ext` -- PR #2541: Improve Documentation Examples and Source Linking ## Improvements - PR #2336: Eliminate `rmm.device_array` usage From 42802869a8dec8a74549c37966fd35d3c4e117dc Mon Sep 17 00:00:00 2001 From: Michael Demoret <42954918+mdemoret-nv@users.noreply.github.com> Date: Tue, 22 Sep 2020 20:33:19 -0600 Subject: [PATCH 13/15] Adding copyright messages to new files from scikit-learn --- docs/source/_static/copybutton.css | 2 ++ docs/source/_static/example_mod.js | 2 ++ docs/source/sphinxext/github_link.py | 2 ++ 3 files changed, 6 insertions(+) diff --git a/docs/source/_static/copybutton.css b/docs/source/_static/copybutton.css index 9ec2ef0ac2..54a5fb3c98 100644 --- 
a/docs/source/_static/copybutton.css +++ b/docs/source/_static/copybutton.css @@ -1,3 +1,5 @@ +/* This contains code with copyright by the scikit-learn project, subject to the license in /thirdparty/LICENSES/LICENSE.scikit_learn */ + /* copybutton */ /* Adds "Show/Hide Output" button to Examples */ diff --git a/docs/source/_static/example_mod.js b/docs/source/_static/example_mod.js index c8f620048d..ead0a5d302 100644 --- a/docs/source/_static/example_mod.js +++ b/docs/source/_static/example_mod.js @@ -1,3 +1,5 @@ +// This contains code with copyright by the scikit-learn project, subject to the license in /thirdparty/LICENSES/LICENSE.scikit_learn + $(document).ready(function () { /* Add a [>>>] button on the top-right corner of code samples to hide * the >>> and ... prompts and the output and thus make the code diff --git a/docs/source/sphinxext/github_link.py b/docs/source/sphinxext/github_link.py index 9fffee58d4..03ccea5088 100644 --- a/docs/source/sphinxext/github_link.py +++ b/docs/source/sphinxext/github_link.py @@ -1,3 +1,5 @@ +# This contains code with copyright by the scikit-learn project, subject to the license in /thirdparty/LICENSES/LICENSE.scikit_learn + import inspect import os import re From 754e18ccbe771dfc55bea53b52e1d82aec9bd6b9 Mon Sep 17 00:00:00 2001 From: Michael Demoret <42954918+mdemoret-nv@users.noreply.github.com> Date: Wed, 23 Sep 2020 11:37:50 -0600 Subject: [PATCH 14/15] Fixing style issues --- docs/source/_static/copybutton.css | 3 ++- docs/source/_static/example_mod.js | 13 ++++++++++--- docs/source/sphinxext/github_link.py | 3 ++- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/docs/source/_static/copybutton.css b/docs/source/_static/copybutton.css index 54a5fb3c98..5eef6e366d 100644 --- a/docs/source/_static/copybutton.css +++ b/docs/source/_static/copybutton.css @@ -1,4 +1,5 @@ -/* This contains code with copyright by the scikit-learn project, subject to the license in /thirdparty/LICENSES/LICENSE.scikit_learn */ +/* This contains code with copyright by the scikit-learn project, subject to +the license in /thirdparty/LICENSES/LICENSE.scikit_learn */ /* copybutton */ /* Adds "Show/Hide Output" button to Examples */ diff --git a/docs/source/_static/example_mod.js b/docs/source/_static/example_mod.js index ead0a5d302..77dc618a82 100644 --- a/docs/source/_static/example_mod.js +++ b/docs/source/_static/example_mod.js @@ -1,4 +1,5 @@ -// This contains code with copyright by the scikit-learn project, subject to the license in /thirdparty/LICENSES/LICENSE.scikit_learn +// This contains code with copyright by the scikit-learn project, subject to +// the license in /thirdparty/LICENSES/LICENSE.scikit_learn $(document).ready(function () { /* Add a [>>>] button on the top-right corner of code samples to hide @@ -38,14 +39,20 @@ $(document).ready(function () { if (button.data('hidden') === 'false') { // hide the code output button.parent().find('.go, .gp, .gt').hide(); - button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'hidden'); + button.next('pre') + .find('.gt') + .nextUntil('.gp, .go') + .css('visibility', 'hidden'); button.css('text-decoration', 'line-through'); button.attr('title', show_text); button.data('hidden', 'true'); } else { // show the code output button.parent().find('.go, .gp, .gt').show(); - button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'visible'); + button.next('pre') + .find('.gt') + .nextUntil('.gp, .go') + .css('visibility', 'visible'); button.css('text-decoration', 'none'); 
button.attr('title', hide_text); button.data('hidden', 'false'); diff --git a/docs/source/sphinxext/github_link.py b/docs/source/sphinxext/github_link.py index 03ccea5088..a7a46fdd9d 100644 --- a/docs/source/sphinxext/github_link.py +++ b/docs/source/sphinxext/github_link.py @@ -1,4 +1,5 @@ -# This contains code with copyright by the scikit-learn project, subject to the license in /thirdparty/LICENSES/LICENSE.scikit_learn +# This contains code with copyright by the scikit-learn project, subject to the +# license in /thirdparty/LICENSES/LICENSE.scikit_learn import inspect import os From db85d35d45eac22707cd15a2f12470a5e486af89 Mon Sep 17 00:00:00 2001 From: Michael Demoret <42954918+mdemoret-nv@users.noreply.github.com> Date: Wed, 23 Sep 2020 12:24:27 -0600 Subject: [PATCH 15/15] Fixing lone sphinx warning --- python/cuml/metrics/_ranking.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/cuml/metrics/_ranking.py b/python/cuml/metrics/_ranking.py index 908bf25e4b..37263ad627 100644 --- a/python/cuml/metrics/_ranking.py +++ b/python/cuml/metrics/_ranking.py @@ -39,8 +39,9 @@ def precision_recall_curve(y_true, probs_pred): respectively and do not have a corresponding threshold. This ensures that the graph starts on the y axis. - Read more in the - :ref:`User Guide `. + Read more in scikit-learn's `User Guide + `_. + Parameters ----------
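To close the series, a minimal sketch of the TargetEncoder whose argument checks appear in PATCH 11 above. The import path follows the `cuml.preprocessing.TargetEncoder.TargetEncoder` reference added to api.rst in that patch; the data and the fit_transform call are illustrative assumptions, not code from this series.

.. code-block:: python

    # Hedged sketch: constructor arguments chosen to satisfy the checks
    # shown in TargetEncoder.__init__ above (0 <= smooth <= 1, n_folds a
    # positive integer, seed an integer, and split_method one of
    # 'random', 'continuous', 'interleaved').
    import cudf
    from cuml.preprocessing.TargetEncoder import TargetEncoder

    train = cudf.DataFrame({'category': ['a', 'b', 'b', 'a'],
                            'label': [1, 0, 1, 1]})

    encoder = TargetEncoder(n_folds=2, smooth=0, seed=42,
                            split_method='interleaved')

    # Each category value is replaced by an out-of-fold mean of the
    # target, which is the grouped-mean behavior the TargetEncoder
    # docstring above describes.
    encoded = encoder.fit_transform(train.category, train.label)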