diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index 500149b89b08b..7259e8cdb7d61 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -1,3 +1,5 @@ +from itertools import repeat + from .pandas_vb_common import * import scipy.sparse from pandas import SparseSeries, SparseDataFrame @@ -27,6 +29,12 @@ class sparse_frame_constructor(object): def time_sparse_frame_constructor(self): SparseDataFrame(columns=np.arange(100), index=np.arange(1000)) + def time_sparse_from_scipy(self): + SparseDataFrame(scipy.sparse.rand(1000, 1000, 0.005)) + + def time_sparse_from_dict(self): + SparseDataFrame(dict(zip(range(1000), repeat([0])))) + class sparse_series_from_coo(object): goal_time = 0.2 diff --git a/asv_bench/vbench_to_asv.py b/asv_bench/vbench_to_asv.py index c3041ec2b1ba1..2a4ce5d183ea2 100644 --- a/asv_bench/vbench_to_asv.py +++ b/asv_bench/vbench_to_asv.py @@ -114,7 +114,7 @@ def translate_module(target_module): l_vars = {} exec('import ' + target_module) in g_vars - print target_module + print(target_module) module = eval(target_module, g_vars) benchmarks = [] @@ -157,7 +157,7 @@ def translate_module(target_module): mod = os.path.basename(module) if mod in ['make.py', 'measure_memory_consumption.py', 'perf_HEAD.py', 'run_suite.py', 'test_perf.py', 'generate_rst_files.py', 'test.py', 'suite.py']: continue - print - print mod + print('') + print(mod) translate_module(mod.replace('.py', '')) diff --git a/bench/alignment.py b/bench/alignment.py deleted file mode 100644 index bc3134f597ee0..0000000000000 --- a/bench/alignment.py +++ /dev/null @@ -1,22 +0,0 @@ -# Setup -from pandas.compat import range, lrange -import numpy as np -import pandas -import la -N = 1000 -K = 50 -arr1 = np.random.randn(N, K) -arr2 = np.random.randn(N, K) -idx1 = lrange(N) -idx2 = lrange(K) - -# pandas -dma1 = pandas.DataFrame(arr1, idx1, idx2) -dma2 = pandas.DataFrame(arr2, idx1[::-1], idx2[::-1]) - -# larry -lar1 = la.larry(arr1, [idx1, idx2]) -lar2 = la.larry(arr2, [idx1[::-1], idx2[::-1]]) - -for i in range(100): - result = lar1 + lar2 diff --git a/bench/bench_dense_to_sparse.py b/bench/bench_dense_to_sparse.py deleted file mode 100644 index e1dcd3456e88d..0000000000000 --- a/bench/bench_dense_to_sparse.py +++ /dev/null @@ -1,14 +0,0 @@ -from pandas import * - -K = 100 -N = 100000 -rng = DatetimeIndex('1/1/2000', periods=N, offset=datetools.Minute()) - -rng2 = np.asarray(rng).astype('M8[us]').astype('i8') - -series = {} -for i in range(1, K + 1): - data = np.random.randn(N)[:-i] - this_rng = rng2[:-i] - data[100:] = np.nan - series[i] = SparseSeries(data, index=this_rng) diff --git a/bench/bench_get_put_value.py b/bench/bench_get_put_value.py deleted file mode 100644 index 427e0b1b10a22..0000000000000 --- a/bench/bench_get_put_value.py +++ /dev/null @@ -1,56 +0,0 @@ -from pandas import * -from pandas.util.testing import rands -from pandas.compat import range - -N = 1000 -K = 50 - - -def _random_index(howmany): - return Index([rands(10) for _ in range(howmany)]) - -df = DataFrame(np.random.randn(N, K), index=_random_index(N), - columns=_random_index(K)) - - -def get1(): - for col in df.columns: - for row in df.index: - _ = df[col][row] - - -def get2(): - for col in df.columns: - for row in df.index: - _ = df.get_value(row, col) - - -def put1(): - for col in df.columns: - for row in df.index: - df[col][row] = 0 - - -def put2(): - for col in df.columns: - for row in df.index: - df.set_value(row, col, 0) - - -def resize1(): - buf = DataFrame() - for col in 
df.columns: - for row in df.index: - buf = buf.set_value(row, col, 5.) - return buf - - -def resize2(): - from collections import defaultdict - - buf = defaultdict(dict) - for col in df.columns: - for row in df.index: - buf[col][row] = 5. - - return DataFrame(buf) diff --git a/bench/bench_groupby.py b/bench/bench_groupby.py deleted file mode 100644 index d7a2853e1e7b2..0000000000000 --- a/bench/bench_groupby.py +++ /dev/null @@ -1,66 +0,0 @@ -from pandas import * -from pandas.util.testing import rands -from pandas.compat import range - -import string -import random - -k = 20000 -n = 10 - -foo = np.tile(np.array([rands(10) for _ in range(k)], dtype='O'), n) -foo2 = list(foo) -random.shuffle(foo) -random.shuffle(foo2) - -df = DataFrame({'A': foo, - 'B': foo2, - 'C': np.random.randn(n * k)}) - -import pandas._sandbox as sbx - - -def f(): - table = sbx.StringHashTable(len(df)) - ret = table.factorize(df['A']) - return ret - - -def g(): - table = sbx.PyObjectHashTable(len(df)) - ret = table.factorize(df['A']) - return ret - -ret = f() - -""" -import pandas._tseries as lib - -f = np.std - - -grouped = df.groupby(['A', 'B']) - -label_list = [ping.labels for ping in grouped.groupings] -shape = [len(ping.ids) for ping in grouped.groupings] - -from pandas.core.groupby import get_group_index - - -group_index = get_group_index(label_list, shape, - sort=True, xnull=True).astype('i4') - -ngroups = np.prod(shape) - -indexer = lib.groupsort_indexer(group_index, ngroups) - -values = df['C'].values.take(indexer) -group_index = group_index.take(indexer) - -f = lambda x: x.std(ddof=1) - -grouper = lib.Grouper(df['C'], np.ndarray.std, group_index, ngroups) -result = grouper.get_result() - -expected = grouped.std() -""" diff --git a/bench/bench_join_panel.py b/bench/bench_join_panel.py deleted file mode 100644 index f3c3f8ba15f70..0000000000000 --- a/bench/bench_join_panel.py +++ /dev/null @@ -1,85 +0,0 @@ -# reasonably efficient - - -def create_panels_append(cls, panels): - """ return an append list of panels """ - panels = [a for a in panels if a is not None] - # corner cases - if len(panels) == 0: - return None - elif len(panels) == 1: - return panels[0] - elif len(panels) == 2 and panels[0] == panels[1]: - return panels[0] - # import pdb; pdb.set_trace() - # create a joint index for the axis - - def joint_index_for_axis(panels, axis): - s = set() - for p in panels: - s.update(list(getattr(p, axis))) - return sorted(list(s)) - - def reindex_on_axis(panels, axis, axis_reindex): - new_axis = joint_index_for_axis(panels, axis) - new_panels = [p.reindex(**{axis_reindex: new_axis, - 'copy': False}) for p in panels] - return new_panels, new_axis - # create the joint major index, dont' reindex the sub-panels - we are - # appending - major = joint_index_for_axis(panels, 'major_axis') - # reindex on minor axis - panels, minor = reindex_on_axis(panels, 'minor_axis', 'minor') - # reindex on items - panels, items = reindex_on_axis(panels, 'items', 'items') - # concatenate values - try: - values = np.concatenate([p.values for p in panels], axis=1) - except Exception as detail: - raise Exception("cannot append values that dont' match dimensions! 
-> [%s] %s" - % (','.join(["%s" % p for p in panels]), str(detail))) - # pm('append - create_panel') - p = Panel(values, items=items, major_axis=major, - minor_axis=minor) - # pm('append - done') - return p - - -# does the job but inefficient (better to handle like you read a table in -# pytables...e.g create a LongPanel then convert to Wide) -def create_panels_join(cls, panels): - """ given an array of panels's, create a single panel """ - panels = [a for a in panels if a is not None] - # corner cases - if len(panels) == 0: - return None - elif len(panels) == 1: - return panels[0] - elif len(panels) == 2 and panels[0] == panels[1]: - return panels[0] - d = dict() - minor, major, items = set(), set(), set() - for panel in panels: - items.update(panel.items) - major.update(panel.major_axis) - minor.update(panel.minor_axis) - values = panel.values - for item, item_index in panel.items.indexMap.items(): - for minor_i, minor_index in panel.minor_axis.indexMap.items(): - for major_i, major_index in panel.major_axis.indexMap.items(): - try: - d[(minor_i, major_i, item)] = values[item_index, major_index, minor_index] - except: - pass - # stack the values - minor = sorted(list(minor)) - major = sorted(list(major)) - items = sorted(list(items)) - # create the 3d stack (items x columns x indicies) - data = np.dstack([np.asarray([np.asarray([d.get((minor_i, major_i, item), np.nan) - for item in items]) - for major_i in major]).transpose() - for minor_i in minor]) - # construct the panel - return Panel(data, items, major, minor) -add_class_method(Panel, create_panels_join, 'join_many') diff --git a/bench/bench_khash_dict.py b/bench/bench_khash_dict.py deleted file mode 100644 index 054fc36131b65..0000000000000 --- a/bench/bench_khash_dict.py +++ /dev/null @@ -1,89 +0,0 @@ -""" -Some comparisons of khash.h to Python dict -""" -from __future__ import print_function - -import numpy as np -import os - -from vbench.api import Benchmark -from pandas.util.testing import rands -from pandas.compat import range -import pandas._tseries as lib -import pandas._sandbox as sbx -import time - -import psutil - -pid = os.getpid() -proc = psutil.Process(pid) - - -def object_test_data(n): - pass - - -def string_test_data(n): - return np.array([rands(10) for _ in range(n)], dtype='O') - - -def int_test_data(n): - return np.arange(n, dtype='i8') - -N = 1000000 - -#---------------------------------------------------------------------- -# Benchmark 1: map_locations - - -def map_locations_python_object(): - arr = string_test_data(N) - return _timeit(lambda: lib.map_indices_object(arr)) - - -def map_locations_khash_object(): - arr = string_test_data(N) - - def f(): - table = sbx.PyObjectHashTable(len(arr)) - table.map_locations(arr) - return _timeit(f) - - -def _timeit(f, iterations=10): - start = time.time() - for _ in range(iterations): - foo = f() - elapsed = time.time() - start - return elapsed - -#---------------------------------------------------------------------- -# Benchmark 2: lookup_locations - - -def lookup_python(values): - table = lib.map_indices_object(values) - return _timeit(lambda: lib.merge_indexer_object(values, table)) - - -def lookup_khash(values): - table = sbx.PyObjectHashTable(len(values)) - table.map_locations(values) - locs = table.lookup_locations(values) - # elapsed = _timeit(lambda: table.lookup_locations2(values)) - return table - - -def leak(values): - for _ in range(100): - print(proc.get_memory_info()) - table = lookup_khash(values) - # table.destroy() - -arr = string_test_data(N) - 
-#---------------------------------------------------------------------- -# Benchmark 3: unique - -#---------------------------------------------------------------------- -# Benchmark 4: factorize diff --git a/bench/bench_merge.R b/bench/bench_merge.R deleted file mode 100644 index 3ed4618494857..0000000000000 --- a/bench/bench_merge.R +++ /dev/null @@ -1,161 +0,0 @@ -library(plyr) -library(data.table) -N <- 10000 -indices = rep(NA, N) -indices2 = rep(NA, N) -for (i in 1:N) { - indices[i] <- paste(sample(letters, 10), collapse="") - indices2[i] <- paste(sample(letters, 10), collapse="") -} -left <- data.frame(key=rep(indices[1:8000], 10), - key2=rep(indices2[1:8000], 10), - value=rnorm(80000)) -right <- data.frame(key=indices[2001:10000], - key2=indices2[2001:10000], - value2=rnorm(8000)) - -right2 <- data.frame(key=rep(right$key, 2), - key2=rep(right$key2, 2), - value2=rnorm(16000)) - -left.dt <- data.table(left, key=c("key", "key2")) -right.dt <- data.table(right, key=c("key", "key2")) -right2.dt <- data.table(right2, key=c("key", "key2")) - -# left.dt2 <- data.table(left) -# right.dt2 <- data.table(right) - -## left <- data.frame(key=rep(indices[1:1000], 10), -## key2=rep(indices2[1:1000], 10), -## value=rnorm(100000)) -## right <- data.frame(key=indices[1:1000], -## key2=indices2[1:1000], -## value2=rnorm(10000)) - -timeit <- function(func, niter=10) { - timing = rep(NA, niter) - for (i in 1:niter) { - gc() - timing[i] <- system.time(func())[3] - } - mean(timing) -} - -left.join <- function(sort=FALSE) { - result <- base::merge(left, right, all.x=TRUE, sort=sort) -} - -right.join <- function(sort=FALSE) { - result <- base::merge(left, right, all.y=TRUE, sort=sort) -} - -outer.join <- function(sort=FALSE) { - result <- base::merge(left, right, all=TRUE, sort=sort) -} - -inner.join <- function(sort=FALSE) { - result <- base::merge(left, right, all=FALSE, sort=sort) -} - -left.join.dt <- function(sort=FALSE) { - result <- right.dt[left.dt] -} - -right.join.dt <- function(sort=FALSE) { - result <- left.dt[right.dt] -} - -outer.join.dt <- function(sort=FALSE) { - result <- merge(left.dt, right.dt, all=TRUE, sort=sort) -} - -inner.join.dt <- function(sort=FALSE) { - result <- merge(left.dt, right.dt, all=FALSE, sort=sort) -} - -plyr.join <- function(type) { - result <- plyr::join(left, right, by=c("key", "key2"), - type=type, match="first") -} - -sort.options <- c(FALSE, TRUE) - -# many-to-one - -results <- matrix(nrow=4, ncol=3) -colnames(results) <- c("base::merge", "plyr", "data.table") -rownames(results) <- c("inner", "outer", "left", "right") - -base.functions <- c(inner.join, outer.join, left.join, right.join) -plyr.functions <- c(function() plyr.join("inner"), - function() plyr.join("full"), - function() plyr.join("left"), - function() plyr.join("right")) -dt.functions <- c(inner.join.dt, outer.join.dt, left.join.dt, right.join.dt) -for (i in 1:4) { - base.func <- base.functions[[i]] - plyr.func <- plyr.functions[[i]] - dt.func <- dt.functions[[i]] - results[i, 1] <- timeit(base.func) - results[i, 2] <- timeit(plyr.func) - results[i, 3] <- timeit(dt.func) -} - - -# many-to-many - -left.join <- function(sort=FALSE) { - result <- base::merge(left, right2, all.x=TRUE, sort=sort) -} - -right.join <- function(sort=FALSE) { - result <- base::merge(left, right2, all.y=TRUE, sort=sort) -} - -outer.join <- function(sort=FALSE) { - result <- base::merge(left, right2, all=TRUE, sort=sort) -} - -inner.join <- function(sort=FALSE) { - result <- base::merge(left, right2, all=FALSE, sort=sort) -} - 
-left.join.dt <- function(sort=FALSE) { - result <- right2.dt[left.dt] -} - -right.join.dt <- function(sort=FALSE) { - result <- left.dt[right2.dt] -} - -outer.join.dt <- function(sort=FALSE) { - result <- merge(left.dt, right2.dt, all=TRUE, sort=sort) -} - -inner.join.dt <- function(sort=FALSE) { - result <- merge(left.dt, right2.dt, all=FALSE, sort=sort) -} - -sort.options <- c(FALSE, TRUE) - -# many-to-one - -results <- matrix(nrow=4, ncol=3) -colnames(results) <- c("base::merge", "plyr", "data.table") -rownames(results) <- c("inner", "outer", "left", "right") - -base.functions <- c(inner.join, outer.join, left.join, right.join) -plyr.functions <- c(function() plyr.join("inner"), - function() plyr.join("full"), - function() plyr.join("left"), - function() plyr.join("right")) -dt.functions <- c(inner.join.dt, outer.join.dt, left.join.dt, right.join.dt) -for (i in 1:4) { - base.func <- base.functions[[i]] - plyr.func <- plyr.functions[[i]] - dt.func <- dt.functions[[i]] - results[i, 1] <- timeit(base.func) - results[i, 2] <- timeit(plyr.func) - results[i, 3] <- timeit(dt.func) -} - diff --git a/bench/bench_merge.py b/bench/bench_merge.py deleted file mode 100644 index 330dba7b9af69..0000000000000 --- a/bench/bench_merge.py +++ /dev/null @@ -1,105 +0,0 @@ -import random -import gc -import time -from pandas import * -from pandas.compat import range, lrange, StringIO -from pandas.util.testing import rands - -N = 10000 -ngroups = 10 - - -def get_test_data(ngroups=100, n=N): - unique_groups = lrange(ngroups) - arr = np.asarray(np.tile(unique_groups, n / ngroups), dtype=object) - - if len(arr) < n: - arr = np.asarray(list(arr) + unique_groups[:n - len(arr)], - dtype=object) - - random.shuffle(arr) - return arr - -# aggregate multiple columns -# df = DataFrame({'key1' : get_test_data(ngroups=ngroups), -# 'key2' : get_test_data(ngroups=ngroups), -# 'data1' : np.random.randn(N), -# 'data2' : np.random.randn(N)}) - -# df2 = DataFrame({'key1' : get_test_data(ngroups=ngroups, n=N//10), -# 'key2' : get_test_data(ngroups=ngroups//2, n=N//10), -# 'value' : np.random.randn(N // 10)}) -# result = merge.merge(df, df2, on='key2') - -N = 10000 - -indices = np.array([rands(10) for _ in range(N)], dtype='O') -indices2 = np.array([rands(10) for _ in range(N)], dtype='O') -key = np.tile(indices[:8000], 10) -key2 = np.tile(indices2[:8000], 10) - -left = DataFrame({'key': key, 'key2': key2, - 'value': np.random.randn(80000)}) -right = DataFrame({'key': indices[2000:], 'key2': indices2[2000:], - 'value2': np.random.randn(8000)}) - -right2 = right.append(right, ignore_index=True) - - -join_methods = ['inner', 'outer', 'left', 'right'] -results = DataFrame(index=join_methods, columns=[False, True]) -niter = 10 -for sort in [False, True]: - for join_method in join_methods: - f = lambda: merge(left, right, how=join_method, sort=sort) - gc.disable() - start = time.time() - for _ in range(niter): - f() - elapsed = (time.time() - start) / niter - gc.enable() - results[sort][join_method] = elapsed -# results.columns = ['pandas'] -results.columns = ['dont_sort', 'sort'] - - -# R results -# many to one -r_results = read_table(StringIO(""" base::merge plyr data.table -inner 0.2475 0.1183 0.1100 -outer 0.4213 0.1916 0.2090 -left 0.2998 0.1188 0.0572 -right 0.3102 0.0536 0.0376 -"""), sep='\s+') - -presults = results[['dont_sort']].rename(columns={'dont_sort': 'pandas'}) -all_results = presults.join(r_results) - -all_results = all_results.div(all_results['pandas'], axis=0) - -all_results = all_results.ix[:, ['pandas', 
'data.table', 'plyr', - 'base::merge']] - -sort_results = DataFrame.from_items([('pandas', results['sort']), - ('R', r_results['base::merge'])]) -sort_results['Ratio'] = sort_results['R'] / sort_results['pandas'] - - -nosort_results = DataFrame.from_items([('pandas', results['dont_sort']), - ('R', r_results['base::merge'])]) -nosort_results['Ratio'] = nosort_results['R'] / nosort_results['pandas'] - -# many to many - -# many to one -r_results = read_table(StringIO("""base::merge plyr data.table -inner 0.4610 0.1276 0.1269 -outer 0.9195 0.1881 0.2725 -left 0.6559 0.1257 0.0678 -right 0.6425 0.0522 0.0428 -"""), sep='\s+') - -all_results = presults.join(r_results) -all_results = all_results.div(all_results['pandas'], axis=0) -all_results = all_results.ix[:, ['pandas', 'data.table', 'plyr', - 'base::merge']] diff --git a/bench/bench_merge_sqlite.py b/bench/bench_merge_sqlite.py deleted file mode 100644 index 3ad4b810119c3..0000000000000 --- a/bench/bench_merge_sqlite.py +++ /dev/null @@ -1,87 +0,0 @@ -import numpy as np -from collections import defaultdict -import gc -import time -from pandas import DataFrame -from pandas.util.testing import rands -from pandas.compat import range, zip -import random - -N = 10000 - -indices = np.array([rands(10) for _ in range(N)], dtype='O') -indices2 = np.array([rands(10) for _ in range(N)], dtype='O') -key = np.tile(indices[:8000], 10) -key2 = np.tile(indices2[:8000], 10) - -left = DataFrame({'key': key, 'key2': key2, - 'value': np.random.randn(80000)}) -right = DataFrame({'key': indices[2000:], 'key2': indices2[2000:], - 'value2': np.random.randn(8000)}) - -# right2 = right.append(right, ignore_index=True) -# right = right2 - -# random.shuffle(key2) -# indices2 = indices.copy() -# random.shuffle(indices2) - -# Prepare Database -import sqlite3 -create_sql_indexes = True - -conn = sqlite3.connect(':memory:') -conn.execute( - 'create table left( key varchar(10), key2 varchar(10), value int);') -conn.execute( - 'create table right( key varchar(10), key2 varchar(10), value2 int);') -conn.executemany('insert into left values (?, ?, ?)', - zip(key, key2, left['value'])) -conn.executemany('insert into right values (?, ?, ?)', - zip(right['key'], right['key2'], right['value2'])) - -# Create Indices -if create_sql_indexes: - conn.execute('create index left_ix on left(key, key2)') - conn.execute('create index right_ix on right(key, key2)') - - -join_methods = ['inner', 'left outer', 'left'] # others not supported -sql_results = DataFrame(index=join_methods, columns=[False]) -niter = 5 -for sort in [False]: - for join_method in join_methods: - sql = """CREATE TABLE test as select * - from left - %s join right - on left.key=right.key - and left.key2 = right.key2;""" % join_method - sql = """select * - from left - %s join right - on left.key=right.key - and left.key2 = right.key2;""" % join_method - - if sort: - sql = '%s order by key, key2' % sql - f = lambda: list(conn.execute(sql)) # list fetches results - g = lambda: conn.execute(sql) # list fetches results - gc.disable() - start = time.time() - # for _ in range(niter): - g() - elapsed = (time.time() - start) / niter - gc.enable() - - cur = conn.execute("DROP TABLE test") - conn.commit() - - sql_results[sort][join_method] = elapsed - sql_results.columns = ['sqlite3'] # ['dont_sort', 'sort'] - sql_results.index = ['inner', 'outer', 'left'] - - sql = """select * - from left - inner join right - on left.key=right.key - and left.key2 = right.key2;""" diff --git a/bench/bench_pivot.R b/bench/bench_pivot.R deleted file 
mode 100644 index 06dc6a105bc43..0000000000000 --- a/bench/bench_pivot.R +++ /dev/null @@ -1,27 +0,0 @@ -library(reshape2) - - -n <- 100000 -a.size <- 5 -b.size <- 5 - -data <- data.frame(a=sample(letters[1:a.size], n, replace=T), - b=sample(letters[1:b.size], n, replace=T), - c=rnorm(n), - d=rnorm(n)) - -timings <- numeric() - -# acast(melt(data, id=c("a", "b")), a ~ b, mean) -# acast(melt(data, id=c("a", "b")), a + b ~ variable, mean) - -for (i in 1:10) { - gc() - tim <- system.time(acast(melt(data, id=c("a", "b")), a ~ b, mean, - subset=.(variable=="c"))) - timings[i] = tim[3] -} - -mean(timings) - -acast(melt(data, id=c("a", "b")), a ~ b, mean, subset=.(variable="c")) diff --git a/bench/bench_pivot.py b/bench/bench_pivot.py deleted file mode 100644 index 007bd0aaebc2f..0000000000000 --- a/bench/bench_pivot.py +++ /dev/null @@ -1,16 +0,0 @@ -from pandas import * -import string - - -n = 100000 -asize = 5 -bsize = 5 - -letters = np.asarray(list(string.letters), dtype=object) - -data = DataFrame(dict(foo=letters[:asize][np.random.randint(0, asize, n)], - bar=letters[:bsize][np.random.randint(0, bsize, n)], - baz=np.random.randn(n), - qux=np.random.randn(n))) - -table = pivot_table(data, xby=['foo', 'bar']) diff --git a/bench/bench_take_indexing.py b/bench/bench_take_indexing.py deleted file mode 100644 index 5fb584bcfe45f..0000000000000 --- a/bench/bench_take_indexing.py +++ /dev/null @@ -1,55 +0,0 @@ -from __future__ import print_function -import numpy as np - -from pandas import * -import pandas._tseries as lib - -from pandas import DataFrame -import timeit -from pandas.compat import zip - -setup = """ -from pandas import Series -import pandas._tseries as lib -import random -import numpy as np - -import random -n = %d -k = %d -arr = np.random.randn(n, k) -indexer = np.arange(n, dtype=np.int32) -indexer = indexer[::-1] -""" - -sizes = [100, 1000, 10000, 100000] -iters = [1000, 1000, 100, 1] - -fancy_2d = [] -take_2d = [] -cython_2d = [] - -n = 1000 - - -def _timeit(stmt, size, k=5, iters=1000): - timer = timeit.Timer(stmt=stmt, setup=setup % (sz, k)) - return timer.timeit(n) / n - -for sz, its in zip(sizes, iters): - print(sz) - fancy_2d.append(_timeit('arr[indexer]', sz, iters=its)) - take_2d.append(_timeit('arr.take(indexer, axis=0)', sz, iters=its)) - cython_2d.append(_timeit('lib.take_axis0(arr, indexer)', sz, iters=its)) - -df = DataFrame({'fancy': fancy_2d, - 'take': take_2d, - 'cython': cython_2d}) - -print(df) - -from pandas.rpy.common import r -r('mat <- matrix(rnorm(50000), nrow=10000, ncol=5)') -r('set.seed(12345') -r('indexer <- sample(1:10000)') -r('mat[indexer,]') diff --git a/bench/bench_unique.py b/bench/bench_unique.py deleted file mode 100644 index 87bd2f2df586c..0000000000000 --- a/bench/bench_unique.py +++ /dev/null @@ -1,278 +0,0 @@ -from __future__ import print_function -from pandas import * -from pandas.util.testing import rands -from pandas.compat import range, zip -import pandas._tseries as lib -import numpy as np -import matplotlib.pyplot as plt - -N = 50000 -K = 10000 - -groups = np.array([rands(10) for _ in range(K)], dtype='O') -groups2 = np.array([rands(10) for _ in range(K)], dtype='O') - -labels = np.tile(groups, N // K) -labels2 = np.tile(groups2, N // K) -data = np.random.randn(N) - - -def timeit(f, niter): - import gc - import time - gc.disable() - start = time.time() - for _ in range(niter): - f() - elapsed = (time.time() - start) / niter - gc.enable() - return elapsed - - -def algo1(): - unique_labels = np.unique(labels) - result = 
np.empty(len(unique_labels)) - for i, label in enumerate(unique_labels): - result[i] = data[labels == label].sum() - - -def algo2(): - unique_labels = np.unique(labels) - indices = lib.groupby_indices(labels) - result = np.empty(len(unique_labels)) - - for i, label in enumerate(unique_labels): - result[i] = data.take(indices[label]).sum() - - -def algo3_nosort(): - rizer = lib.DictFactorizer() - labs, counts = rizer.factorize(labels, sort=False) - k = len(rizer.uniques) - out = np.empty(k) - lib.group_add(out, counts, data, labs) - - -def algo3_sort(): - rizer = lib.DictFactorizer() - labs, counts = rizer.factorize(labels, sort=True) - k = len(rizer.uniques) - out = np.empty(k) - lib.group_add(out, counts, data, labs) - -import numpy as np -import random - - -# dict to hold results -counts = {} - -# a hack to generate random key, value pairs. -# 5k keys, 100k values -x = np.tile(np.arange(5000, dtype='O'), 20) -random.shuffle(x) -xarr = x -x = [int(y) for y in x] -data = np.random.uniform(0, 1, 100000) - - -def f(): - # groupby sum - for k, v in zip(x, data): - try: - counts[k] += v - except KeyError: - counts[k] = v - - -def f2(): - rizer = lib.DictFactorizer() - labs, counts = rizer.factorize(xarr, sort=False) - k = len(rizer.uniques) - out = np.empty(k) - lib.group_add(out, counts, data, labs) - - -def algo4(): - rizer = lib.DictFactorizer() - labs1, _ = rizer.factorize(labels, sort=False) - k1 = len(rizer.uniques) - - rizer = lib.DictFactorizer() - labs2, _ = rizer.factorize(labels2, sort=False) - k2 = len(rizer.uniques) - - group_id = labs1 * k2 + labs2 - max_group = k1 * k2 - - if max_group > 1e6: - rizer = lib.Int64Factorizer(len(group_id)) - group_id, _ = rizer.factorize(group_id.astype('i8'), sort=True) - max_group = len(rizer.uniques) - - out = np.empty(max_group) - counts = np.zeros(max_group, dtype='i4') - lib.group_add(out, counts, data, group_id) - -# cumtime percall filename:lineno(function) -# 0.592 0.592 :1() - # 0.584 0.006 groupby_ex.py:37(algo3_nosort) - # 0.535 0.005 {method 'factorize' of DictFactorizer' objects} - # 0.047 0.000 {pandas._tseries.group_add} - # 0.002 0.000 numeric.py:65(zeros_like) - # 0.001 0.000 {method 'fill' of 'numpy.ndarray' objects} - # 0.000 0.000 {numpy.core.multiarray.empty_like} - # 0.000 0.000 {numpy.core.multiarray.empty} - -# UNIQUE timings - -# N = 10000000 -# K = 500000 - -# groups = np.array([rands(10) for _ in range(K)], dtype='O') - -# labels = np.tile(groups, N // K) -data = np.random.randn(N) - -data = np.random.randn(N) - -Ks = [100, 1000, 5000, 10000, 25000, 50000, 100000] - -# Ks = [500000, 1000000, 2500000, 5000000, 10000000] - -import psutil -import os -import gc - -pid = os.getpid() -proc = psutil.Process(pid) - - -def dict_unique(values, expected_K, sort=False, memory=False): - if memory: - gc.collect() - before_mem = proc.get_memory_info().rss - - rizer = lib.DictFactorizer() - result = rizer.unique_int64(values) - - if memory: - result = proc.get_memory_info().rss - before_mem - return result - - if sort: - result.sort() - assert(len(result) == expected_K) - return result - - -def khash_unique(values, expected_K, size_hint=False, sort=False, - memory=False): - if memory: - gc.collect() - before_mem = proc.get_memory_info().rss - - if size_hint: - rizer = lib.Factorizer(len(values)) - else: - rizer = lib.Factorizer(100) - - result = [] - result = rizer.unique(values) - - if memory: - result = proc.get_memory_info().rss - before_mem - return result - - if sort: - result.sort() - assert(len(result) == expected_K) - - -def 
khash_unique_str(values, expected_K, size_hint=False, sort=False, - memory=False): - if memory: - gc.collect() - before_mem = proc.get_memory_info().rss - - if size_hint: - rizer = lib.StringHashTable(len(values)) - else: - rizer = lib.StringHashTable(100) - - result = [] - result = rizer.unique(values) - - if memory: - result = proc.get_memory_info().rss - before_mem - return result - - if sort: - result.sort() - assert(len(result) == expected_K) - - -def khash_unique_int64(values, expected_K, size_hint=False, sort=False): - if size_hint: - rizer = lib.Int64HashTable(len(values)) - else: - rizer = lib.Int64HashTable(100) - - result = [] - result = rizer.unique(values) - - if sort: - result.sort() - assert(len(result) == expected_K) - - -def hash_bench(): - numpy = [] - dict_based = [] - dict_based_sort = [] - khash_hint = [] - khash_nohint = [] - for K in Ks: - print(K) - # groups = np.array([rands(10) for _ in range(K)]) - # labels = np.tile(groups, N // K).astype('O') - - groups = np.random.randint(0, long(100000000000), size=K) - labels = np.tile(groups, N // K) - dict_based.append(timeit(lambda: dict_unique(labels, K), 20)) - khash_nohint.append(timeit(lambda: khash_unique_int64(labels, K), 20)) - khash_hint.append(timeit(lambda: khash_unique_int64(labels, K, - size_hint=True), 20)) - - # memory, hard to get - # dict_based.append(np.mean([dict_unique(labels, K, memory=True) - # for _ in range(10)])) - # khash_nohint.append(np.mean([khash_unique(labels, K, memory=True) - # for _ in range(10)])) - # khash_hint.append(np.mean([khash_unique(labels, K, size_hint=True, memory=True) - # for _ in range(10)])) - - # dict_based_sort.append(timeit(lambda: dict_unique(labels, K, - # sort=True), 10)) - # numpy.append(timeit(lambda: np.unique(labels), 10)) - - # unique_timings = DataFrame({'numpy.unique' : numpy, - # 'dict, no sort' : dict_based, - # 'dict, sort' : dict_based_sort}, - # columns=['dict, no sort', - # 'dict, sort', 'numpy.unique'], - # index=Ks) - - unique_timings = DataFrame({'dict': dict_based, - 'khash, preallocate': khash_hint, - 'khash': khash_nohint}, - columns=['khash, preallocate', 'khash', 'dict'], - index=Ks) - - unique_timings.plot(kind='bar', legend=False) - plt.legend(loc='best') - plt.title('Unique on 100,000 values, int64') - plt.xlabel('Number of unique labels') - plt.ylabel('Mean execution time') - - plt.show() diff --git a/bench/bench_with_subset.R b/bench/bench_with_subset.R deleted file mode 100644 index 69d0f7a9eec63..0000000000000 --- a/bench/bench_with_subset.R +++ /dev/null @@ -1,53 +0,0 @@ -library(microbenchmark) -library(data.table) - - -data.frame.subset.bench <- function (n=1e7, times=30) { - df <- data.frame(a=rnorm(n), b=rnorm(n), c=rnorm(n)) - print(microbenchmark(subset(df, a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c), - times=times)) -} - - -# data.table allows something very similar to query with an expression -# but we have chained comparisons AND we're faster BOO YAH! 
-data.table.subset.expression.bench <- function (n=1e7, times=30) { - dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n)) - print(microbenchmark(dt[, a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c], - times=times)) -} - - -# compare against subset with data.table for good measure -data.table.subset.bench <- function (n=1e7, times=30) { - dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n)) - print(microbenchmark(subset(dt, a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c), - times=times)) -} - - -data.frame.with.bench <- function (n=1e7, times=30) { - df <- data.frame(a=rnorm(n), b=rnorm(n), c=rnorm(n)) - - print(microbenchmark(with(df, a + b * (c ^ 2 + b ^ 2 - a) / (a * c) ^ 3), - times=times)) -} - - -data.table.with.bench <- function (n=1e7, times=30) { - dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n)) - print(microbenchmark(with(dt, a + b * (c ^ 2 + b ^ 2 - a) / (a * c) ^ 3), - times=times)) -} - - -bench <- function () { - data.frame.subset.bench() - data.table.subset.expression.bench() - data.table.subset.bench() - data.frame.with.bench() - data.table.with.bench() -} - - -bench() diff --git a/bench/bench_with_subset.py b/bench/bench_with_subset.py deleted file mode 100644 index 017401df3f7f3..0000000000000 --- a/bench/bench_with_subset.py +++ /dev/null @@ -1,116 +0,0 @@ -#!/usr/bin/env python - -""" -Microbenchmarks for comparison with R's "with" and "subset" functions -""" - -from __future__ import print_function -import numpy as np -from numpy import array -from timeit import repeat as timeit -from pandas.compat import range, zip -from pandas import DataFrame - - -setup_common = """from pandas import DataFrame -from numpy.random import randn -df = DataFrame(randn(%d, 3), columns=list('abc')) -%s""" - - -setup_with = "s = 'a + b * (c ** 2 + b ** 2 - a) / (a * c) ** 3'" - - -def bench_with(n, times=10, repeat=3, engine='numexpr'): - return np.array(timeit('df.eval(s, engine=%r)' % engine, - setup=setup_common % (n, setup_with), - repeat=repeat, number=times)) / times - - -setup_subset = "s = 'a <= b <= c ** 2 + b ** 2 - a and b > c'" - - -def bench_subset(n, times=10, repeat=3, engine='numexpr'): - return np.array(timeit('df.query(s, engine=%r)' % engine, - setup=setup_common % (n, setup_subset), - repeat=repeat, number=times)) / times - - -def bench(mn=1, mx=7, num=100, engines=('python', 'numexpr'), verbose=False): - r = np.logspace(mn, mx, num=num).round().astype(int) - - ev = DataFrame(np.empty((num, len(engines))), columns=engines) - qu = ev.copy(deep=True) - - ev['size'] = qu['size'] = r - - for engine in engines: - for i, n in enumerate(r): - if verbose: - print('engine: %r, i == %d' % (engine, i)) - ev.loc[i, engine] = bench_with(n, times=1, repeat=1, engine=engine) - qu.loc[i, engine] = bench_subset(n, times=1, repeat=1, - engine=engine) - - return ev, qu - - -def plot_perf(df, engines, title, filename=None): - from matplotlib.pyplot import figure, rc - - try: - from mpltools import style - except ImportError: - pass - else: - style.use('ggplot') - - rc('text', usetex=True) - - fig = figure(figsize=(4, 3), dpi=100) - ax = fig.add_subplot(111) - - for engine in engines: - ax.plot(df.size, df[engine], label=engine, lw=2) - - ax.set_xlabel('Number of Rows') - ax.set_ylabel('Time (s)') - ax.set_title(title) - ax.legend(loc='best') - ax.tick_params(top=False, right=False) - - fig.tight_layout() - - if filename is not None: - fig.savefig(filename) - - -if __name__ == '__main__': - import os - import pandas as pd - - pandas_dir = 
os.path.dirname(os.path.abspath(os.path.dirname(__file__))) - static_path = os.path.join(pandas_dir, 'doc', 'source', '_static') - - join = lambda p: os.path.join(static_path, p) - - fn = join('eval-query-perf-data.h5') - - engines = 'python', 'numexpr' - - if not os.path.exists(fn): - ev, qu = bench(verbose=True) - ev.to_hdf(fn, 'eval') - qu.to_hdf(fn, 'query') - else: - ev = pd.read_hdf(fn, 'eval') - qu = pd.read_hdf(fn, 'query') - - plot_perf(ev, engines, 'DataFrame.eval()', filename=join('eval-perf.png')) - plot_perf(qu, engines, 'DataFrame.query()', - filename=join('query-perf.png')) - - plot_perf(ev[ev.size <= 50000], engines, 'DataFrame.eval()', - filename=join('eval-perf-small.png')) - plot_perf(qu[qu.size <= 500000], engines, 'DataFrame.query()', - filename=join('query-perf-small.png')) diff --git a/bench/better_unique.py b/bench/better_unique.py deleted file mode 100644 index e03a4f433ce66..0000000000000 --- a/bench/better_unique.py +++ /dev/null @@ -1,80 +0,0 @@ -from __future__ import print_function -from pandas import DataFrame -from pandas.compat import range, zip -import timeit - -setup = """ -from pandas import Series -import pandas._tseries as _tseries -from pandas.compat import range -import random -import numpy as np - -def better_unique(values): - uniques = _tseries.fast_unique(values) - id_map = _tseries.map_indices_buf(uniques) - labels = _tseries.get_unique_labels(values, id_map) - return uniques, labels - -tot = 100000 - -def get_test_data(ngroups=100, n=tot): - unique_groups = range(ngroups) - random.shuffle(unique_groups) - arr = np.asarray(np.tile(unique_groups, n / ngroups), dtype=object) - - if len(arr) < n: - arr = np.asarray(list(arr) + unique_groups[:n - len(arr)], - dtype=object) - - return arr - -arr = get_test_data(ngroups=%d) -""" - -group_sizes = [10, 100, 1000, 10000, - 20000, 30000, 40000, - 50000, 60000, 70000, - 80000, 90000, 100000] - -numbers = [100, 100, 50] + [10] * 10 - -numpy = [] -wes = [] - -for sz, n in zip(group_sizes, numbers): - # wes_timer = timeit.Timer(stmt='better_unique(arr)', - # setup=setup % sz) - wes_timer = timeit.Timer(stmt='_tseries.fast_unique(arr)', - setup=setup % sz) - - numpy_timer = timeit.Timer(stmt='np.unique(arr)', - setup=setup % sz) - - print(n) - numpy_result = numpy_timer.timeit(number=n) / n - wes_result = wes_timer.timeit(number=n) / n - - print('Groups: %d, NumPy: %s, Wes: %s' % (sz, numpy_result, wes_result)) - - wes.append(wes_result) - numpy.append(numpy_result) - -result = DataFrame({'wes': wes, 'numpy': numpy}, index=group_sizes) - - -def make_plot(numpy, wes): - pass - -# def get_test_data(ngroups=100, n=100000): -# unique_groups = range(ngroups) -# random.shuffle(unique_groups) -# arr = np.asarray(np.tile(unique_groups, n / ngroups), dtype=object) - -# if len(arr) < n: -# arr = np.asarray(list(arr) + unique_groups[:n - len(arr)], -# dtype=object) - -# return arr - -# arr = get_test_data(ngroups=1000) diff --git a/bench/duplicated.R b/bench/duplicated.R deleted file mode 100644 index eb2376df2932a..0000000000000 --- a/bench/duplicated.R +++ /dev/null @@ -1,22 +0,0 @@ -N <- 100000 - -k1 = rep(NA, N) -k2 = rep(NA, N) -for (i in 1:N){ - k1[i] <- paste(sample(letters, 1), collapse="") - k2[i] <- paste(sample(letters, 1), collapse="") -} -df <- data.frame(a=k1, b=k2, c=rep(1:100, N / 100)) -df2 <- data.frame(a=k1, b=k2) - -timings <- numeric() -timings2 <- numeric() -for (i in 1:50) { - gc() - timings[i] = system.time(deduped <- df[!duplicated(df),])[3] - gc() - timings2[i] = system.time(deduped <- 
df[!duplicated(df[,c("a", "b")]),])[3] -} - -mean(timings) -mean(timings2) diff --git a/bench/io_roundtrip.py b/bench/io_roundtrip.py deleted file mode 100644 index d87da0ec6321a..0000000000000 --- a/bench/io_roundtrip.py +++ /dev/null @@ -1,116 +0,0 @@ -from __future__ import print_function -import time -import os -import numpy as np - -import la -import pandas -from pandas.compat import range -from pandas import datetools, DatetimeIndex - - -def timeit(f, iterations): - start = time.clock() - - for i in range(iterations): - f() - - return time.clock() - start - - -def rountrip_archive(N, K=50, iterations=10): - # Create data - arr = np.random.randn(N, K) - # lar = la.larry(arr) - dma = pandas.DataFrame(arr, - DatetimeIndex('1/1/2000', periods=N, - offset=datetools.Minute())) - dma[201] = 'bar' - - # filenames - filename_numpy = '/Users/wesm/tmp/numpy.npz' - filename_larry = '/Users/wesm/tmp/archive.hdf5' - filename_pandas = '/Users/wesm/tmp/pandas_tmp' - - # Delete old files - try: - os.unlink(filename_numpy) - except: - pass - try: - os.unlink(filename_larry) - except: - pass - - try: - os.unlink(filename_pandas) - except: - pass - - # Time a round trip save and load - # numpy_f = lambda: numpy_roundtrip(filename_numpy, arr, arr) - # numpy_time = timeit(numpy_f, iterations) / iterations - - # larry_f = lambda: larry_roundtrip(filename_larry, lar, lar) - # larry_time = timeit(larry_f, iterations) / iterations - - pandas_f = lambda: pandas_roundtrip(filename_pandas, dma, dma) - pandas_time = timeit(pandas_f, iterations) / iterations - print('pandas (HDF5) %7.4f seconds' % pandas_time) - - pickle_f = lambda: pandas_roundtrip(filename_pandas, dma, dma) - pickle_time = timeit(pickle_f, iterations) / iterations - print('pandas (pickle) %7.4f seconds' % pickle_time) - - # print('Numpy (npz) %7.4f seconds' % numpy_time) - # print('larry (HDF5) %7.4f seconds' % larry_time) - - # Delete old files - try: - os.unlink(filename_numpy) - except: - pass - try: - os.unlink(filename_larry) - except: - pass - - try: - os.unlink(filename_pandas) - except: - pass - - -def numpy_roundtrip(filename, arr1, arr2): - np.savez(filename, arr1=arr1, arr2=arr2) - npz = np.load(filename) - arr1 = npz['arr1'] - arr2 = npz['arr2'] - - -def larry_roundtrip(filename, lar1, lar2): - io = la.IO(filename) - io['lar1'] = lar1 - io['lar2'] = lar2 - lar1 = io['lar1'] - lar2 = io['lar2'] - - -def pandas_roundtrip(filename, dma1, dma2): - # What's the best way to code this? 
- from pandas.io.pytables import HDFStore - store = HDFStore(filename) - store['dma1'] = dma1 - store['dma2'] = dma2 - dma1 = store['dma1'] - dma2 = store['dma2'] - - -def pandas_roundtrip_pickle(filename, dma1, dma2): - dma1.save(filename) - dma1 = pandas.DataFrame.load(filename) - dma2.save(filename) - dma2 = pandas.DataFrame.load(filename) - -if __name__ == '__main__': - rountrip_archive(10000, K=200) diff --git a/bench/larry.py b/bench/larry.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/bench/serialize.py b/bench/serialize.py deleted file mode 100644 index b0edd6a5752d2..0000000000000 --- a/bench/serialize.py +++ /dev/null @@ -1,89 +0,0 @@ -from __future__ import print_function -from pandas.compat import range, lrange -import time -import os -import numpy as np - -import la -import pandas - - -def timeit(f, iterations): - start = time.clock() - - for i in range(iterations): - f() - - return time.clock() - start - - -def roundtrip_archive(N, iterations=10): - - # Create data - arr = np.random.randn(N, N) - lar = la.larry(arr) - dma = pandas.DataFrame(arr, lrange(N), lrange(N)) - - # filenames - filename_numpy = '/Users/wesm/tmp/numpy.npz' - filename_larry = '/Users/wesm/tmp/archive.hdf5' - filename_pandas = '/Users/wesm/tmp/pandas_tmp' - - # Delete old files - try: - os.unlink(filename_numpy) - except: - pass - try: - os.unlink(filename_larry) - except: - pass - try: - os.unlink(filename_pandas) - except: - pass - - # Time a round trip save and load - numpy_f = lambda: numpy_roundtrip(filename_numpy, arr, arr) - numpy_time = timeit(numpy_f, iterations) / iterations - - larry_f = lambda: larry_roundtrip(filename_larry, lar, lar) - larry_time = timeit(larry_f, iterations) / iterations - - pandas_f = lambda: pandas_roundtrip(filename_pandas, dma, dma) - pandas_time = timeit(pandas_f, iterations) / iterations - - print('Numpy (npz) %7.4f seconds' % numpy_time) - print('larry (HDF5) %7.4f seconds' % larry_time) - print('pandas (HDF5) %7.4f seconds' % pandas_time) - - -def numpy_roundtrip(filename, arr1, arr2): - np.savez(filename, arr1=arr1, arr2=arr2) - npz = np.load(filename) - arr1 = npz['arr1'] - arr2 = npz['arr2'] - - -def larry_roundtrip(filename, lar1, lar2): - io = la.IO(filename) - io['lar1'] = lar1 - io['lar2'] = lar2 - lar1 = io['lar1'] - lar2 = io['lar2'] - - -def pandas_roundtrip(filename, dma1, dma2): - from pandas.io.pytables import HDFStore - store = HDFStore(filename) - store['dma1'] = dma1 - store['dma2'] = dma2 - dma1 = store['dma1'] - dma2 = store['dma2'] - - -def pandas_roundtrip_pickle(filename, dma1, dma2): - dma1.save(filename) - dma1 = pandas.DataFrame.load(filename) - dma2.save(filename) - dma2 = pandas.DataFrame.load(filename) diff --git a/bench/test.py b/bench/test.py deleted file mode 100644 index 2339deab313a1..0000000000000 --- a/bench/test.py +++ /dev/null @@ -1,70 +0,0 @@ -import numpy as np -import itertools -import collections -import scipy.ndimage as ndi -from pandas.compat import zip, range - -N = 10000 - -lat = np.random.randint(0, 360, N) -lon = np.random.randint(0, 360, N) -data = np.random.randn(N) - - -def groupby1(lat, lon, data): - indexer = np.lexsort((lon, lat)) - lat = lat.take(indexer) - lon = lon.take(indexer) - sorted_data = data.take(indexer) - - keys = 1000. 
* lat + lon - unique_keys = np.unique(keys) - bounds = keys.searchsorted(unique_keys) - - result = group_agg(sorted_data, bounds, lambda x: x.mean()) - - decoder = keys.searchsorted(unique_keys) - - return dict(zip(zip(lat.take(decoder), lon.take(decoder)), result)) - - -def group_mean(lat, lon, data): - indexer = np.lexsort((lon, lat)) - lat = lat.take(indexer) - lon = lon.take(indexer) - sorted_data = data.take(indexer) - - keys = 1000 * lat + lon - unique_keys = np.unique(keys) - - result = ndi.mean(sorted_data, labels=keys, index=unique_keys) - decoder = keys.searchsorted(unique_keys) - - return dict(zip(zip(lat.take(decoder), lon.take(decoder)), result)) - - -def group_mean_naive(lat, lon, data): - grouped = collections.defaultdict(list) - for lt, ln, da in zip(lat, lon, data): - grouped[(lt, ln)].append(da) - - averaged = dict((ltln, np.mean(da)) for ltln, da in grouped.items()) - - return averaged - - -def group_agg(values, bounds, f): - N = len(values) - result = np.empty(len(bounds), dtype=float) - for i, left_bound in enumerate(bounds): - if i == len(bounds) - 1: - right_bound = N - else: - right_bound = bounds[i + 1] - - result[i] = f(values[left_bound: right_bound]) - - return result - -# for i in range(10): -# groupby1(lat, lon, data) diff --git a/bench/zoo_bench.R b/bench/zoo_bench.R deleted file mode 100644 index 294d55f51a9ab..0000000000000 --- a/bench/zoo_bench.R +++ /dev/null @@ -1,71 +0,0 @@ -library(zoo) -library(xts) -library(fts) -library(tseries) -library(its) -library(xtable) - -## indices = rep(NA, 100000) -## for (i in 1:100000) -## indices[i] <- paste(sample(letters, 10), collapse="") - - - -## x <- zoo(rnorm(100000), indices) -## y <- zoo(rnorm(90000), indices[sample(1:100000, 90000)]) - -## indices <- as.POSIXct(1:100000) - -indices <- as.POSIXct(Sys.Date()) + seq(1, 100000000, 100) - -sz <- 500000 - -## x <- xts(rnorm(sz), sample(indices, sz)) -## y <- xts(rnorm(sz), sample(indices, sz)) - -zoo.bench <- function(){ - x <- zoo(rnorm(sz), sample(indices, sz)) - y <- zoo(rnorm(sz), sample(indices, sz)) - timeit(function() {x + y}) -} - -xts.bench <- function(){ - x <- xts(rnorm(sz), sample(indices, sz)) - y <- xts(rnorm(sz), sample(indices, sz)) - timeit(function() {x + y}) -} - -fts.bench <- function(){ - x <- fts(rnorm(sz), sort(sample(indices, sz))) - y <- fts(rnorm(sz), sort(sample(indices, sz)) - timeit(function() {x + y}) -} - -its.bench <- function(){ - x <- its(rnorm(sz), sort(sample(indices, sz))) - y <- its(rnorm(sz), sort(sample(indices, sz))) - timeit(function() {x + y}) -} - -irts.bench <- function(){ - x <- irts(sort(sample(indices, sz)), rnorm(sz)) - y <- irts(sort(sample(indices, sz)), rnorm(sz)) - timeit(function() {x + y}) -} - -timeit <- function(f){ - timings <- numeric() - for (i in 1:10) { - gc() - timings[i] = system.time(f())[3] - } - mean(timings) -} - -bench <- function(){ - results <- c(xts.bench(), fts.bench(), its.bench(), zoo.bench()) - names <- c("xts", "fts", "its", "zoo") - data.frame(results, names) -} - -result <- bench() diff --git a/bench/zoo_bench.py b/bench/zoo_bench.py deleted file mode 100644 index 74cb1952a5a2a..0000000000000 --- a/bench/zoo_bench.py +++ /dev/null @@ -1,36 +0,0 @@ -from pandas import * -from pandas.util.testing import rands - -n = 1000000 -# indices = Index([rands(10) for _ in xrange(n)]) - - -def sample(values, k): - sampler = np.random.permutation(len(values)) - return values.take(sampler[:k]) -sz = 500000 -rng = np.arange(0, 10000000000000, 10000000) -stamps = np.datetime64(datetime.now()).view('i8') + 
rng -idx1 = np.sort(sample(stamps, sz)) -idx2 = np.sort(sample(stamps, sz)) -ts1 = Series(np.random.randn(sz), idx1) -ts2 = Series(np.random.randn(sz), idx2) - - -# subsample_size = 90000 - -# x = Series(np.random.randn(100000), indices) -# y = Series(np.random.randn(subsample_size), -# index=sample(indices, subsample_size)) - - -# lx = larry(np.random.randn(100000), [list(indices)]) -# ly = larry(np.random.randn(subsample_size), [list(y.index)]) - -# Benchmark 1: Two 1-million length time series (int64-based index) with -# randomly chosen timestamps - -# Benchmark 2: Join two 5-variate time series DataFrames (outer and inner join) - -# df1 = DataFrame(np.random.randn(1000000, 5), idx1, columns=range(5)) -# df2 = DataFrame(np.random.randn(1000000, 5), idx2, columns=range(5, 10)) diff --git a/doc/source/api.rst b/doc/source/api.rst index d6053791d6f4b..77d095a965221 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -270,6 +270,7 @@ Conversion :toctree: generated/ Series.astype + Series.infer_objects Series.copy Series.isnull Series.notnull @@ -777,6 +778,7 @@ Conversion DataFrame.astype DataFrame.convert_objects + DataFrame.infer_objects DataFrame.copy DataFrame.isnull DataFrame.notnull diff --git a/doc/source/basics.rst b/doc/source/basics.rst index d8b1602fb104d..aae1fffb7a3b6 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -2024,7 +2024,29 @@ object conversion ~~~~~~~~~~~~~~~~~ pandas offers various functions to try to force conversion of types from the ``object`` dtype to other types. -The following functions are available for one dimensional object arrays or scalars: +In cases where the data is already of the correct type, but stored in an ``object`` array, the +:meth:`DataFrame.infer_objects` and :meth:`Series.infer_objects` methods can be used to soft convert +to the correct type. + + .. ipython:: python + + import datetime + df = pd.DataFrame([[1, 2], + ['a', 'b'], + [datetime.datetime(2016, 3, 2), datetime.datetime(2016, 3, 2)]]) + df = df.T + df + df.dtypes + +Because the data was transposed, the original inference stored all columns as object, which +``infer_objects`` will correct. + + .. ipython:: python + + df.infer_objects().dtypes + +The following functions are available for one dimensional object arrays or scalars to perform +hard conversion of objects to a specified type: - :meth:`~pandas.to_numeric` (conversion to numeric dtypes) diff --git a/doc/source/conf.py b/doc/source/conf.py index 394fa44c30573..cb3063d59beae 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -17,6 +17,11 @@ import importlib from pandas.compat import u, PY3 +try: + raw_input # Python 2 +except NameError: + raw_input = input # Python 3 + # https://github.com/sphinx-doc/sphinx/pull/2325/files # Workaround for sphinx-build recursion limit overflow: # pickle.dump(doctree, f, pickle.HIGHEST_PROTOCOL) diff --git a/doc/source/io.rst b/doc/source/io.rst index 9bf84e5419ffa..495d4e9c3a5a3 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1258,7 +1258,8 @@ Files with Fixed Width Columns While ``read_csv`` reads delimited data, the :func:`read_fwf` function works with data files that have known and fixed column widths. 
The function parameters -to ``read_fwf`` are largely the same as `read_csv` with two extra parameters: +to ``read_fwf`` are largely the same as ``read_csv`` with two extra parameters, and +a different usage of the ``delimiter`` parameter: - ``colspecs``: A list of pairs (tuples) giving the extents of the fixed-width fields of each line as half-open intervals (i.e., [from, to[ ). @@ -1267,6 +1268,9 @@ to ``read_fwf`` are largely the same as `read_csv` with two extra parameters: behaviour, if not specified, is to infer. - ``widths``: A list of field widths which can be used instead of 'colspecs' if the intervals are contiguous. + - ``delimiter``: Characters to consider as filler characters in the fixed-width file. + Can be used to specify the filler character of the fields + if it is not spaces (e.g., '~'). .. ipython:: python :suppress: diff --git a/doc/source/options.rst b/doc/source/options.rst index f373705a96f48..c585da64efece 100644 --- a/doc/source/options.rst +++ b/doc/source/options.rst @@ -304,7 +304,6 @@ display.float_format None The callable should accept a fl This is used in some places like SeriesFormatter. See core.format.EngFormatter for an example. -display.height 60 Deprecated. Use `display.max_rows` instead. display.large_repr truncate For DataFrames exceeding max_rows/max_cols, the repr (and HTML repr) can show a truncated table (the default from 0.13), @@ -323,7 +322,6 @@ display.latex.multicolumn_format 'l' Alignment of multicolumn labels display.latex.multirow False Combines rows when using a MultiIndex. Centered instead of top-aligned, separated by clines. -display.line_width 80 Deprecated. Use `display.width` instead. display.max_columns 20 max_rows and max_columns are used in __repr__() methods to decide if to_string() or info() is used to diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index 1dd80aec4fd6c..ce4a920ad77b5 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -1092,10 +1092,10 @@ frequencies. We will refer to these aliases as *offset aliases* "BQ", "business quarter end frequency" "QS", "quarter start frequency" "BQS", "business quarter start frequency" - "A", "year end frequency" - "BA", "business year end frequency" - "AS", "year start frequency" - "BAS", "business year start frequency" + "A, Y", "year end frequency" + "BA, BY", "business year end frequency" + "AS, YS", "year start frequency" + "BAS, BYS", "business year start frequency" "BH", "business hour frequency" "H", "hourly frequency" "T, min", "minutely frequency" diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 7f1007d25e9ce..3cffab477e430 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -25,6 +25,39 @@ New features - Added ``__fspath__`` method to :class:`~pandas.HDFStore`, :class:`~pandas.ExcelFile`, and :class:`~pandas.ExcelWriter` to work properly with the file system path protocol (:issue:`13823`) + +.. _whatsnew_0210.enhancements.infer_objects: + +``infer_objects`` type conversion +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :meth:`DataFrame.infer_objects` and :meth:`Series.infer_objects` +methods have been added to perform dtype inference on object columns, replacing +some of the functionality of the deprecated ``convert_objects`` +method. See the documentation :ref:`here <basics.object_conversion>` +for more details. (:issue:`11221`) + +This method only performs soft conversions on object columns, converting Python objects +to native types, but not any coercive conversions. For example: + +.. 
ipython:: python + + df = pd.DataFrame({'A': [1, 2, 3], + 'B': np.array([1, 2, 3], dtype='object'), + 'C': ['1', '2', '3']}) + df.dtypes + df.infer_objects().dtypes + +Note that column ``'C'`` was not converted - only scalar numeric types +will be inferred to a new type. Other types of conversion should be accomplished +using the :func:`to_numeric` function (or :func:`to_datetime`, :func:`to_timedelta`). + +.. ipython:: python + + df = df.infer_objects() + df['C'] = pd.to_numeric(df['C'], errors='coerce') + df.dtypes + .. _whatsnew_0210.enhancements.other: Other Enhancements @@ -40,6 +73,8 @@ Other Enhancements - :func:`DataFrame.clip()` and :func:`Series.clip()` have gained an ``inplace`` argument. (:issue:`15388`) - :func:`crosstab` has gained a ``margins_name`` parameter to define the name of the row / column that will contain the totals when ``margins=True``. (:issue:`15972`) - :func:`DataFrame.select_dtypes` now accepts scalar values for include/exclude as well as list-like. (:issue:`16855`) +- :func:`date_range` now accepts 'YS' in addition to 'AS' as an alias for start of year (:issue:`9313`) +- :func:`date_range` now accepts 'Y' in addition to 'A' as an alias for end of year (:issue:`9313`) .. _whatsnew_0210.api_breaking: @@ -124,6 +159,8 @@ Removal of prior version deprecations/changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - :func:`read_excel()` has dropped the ``has_index_names`` parameter (:issue:`10967`) +- The ``pd.options.display.height`` configuration has been dropped (:issue:`3663`) +- The ``pd.options.display.line_width`` configuration has been dropped (:issue:`2881`) - The ``pd.options.display.mpl_style`` configuration has been dropped (:issue:`12190`) - ``Index`` has dropped the ``.sym_diff()`` method in favor of ``.symmetric_difference()`` (:issue:`12591`) - ``Categorical`` has dropped the ``.order()`` and ``.sort()`` methods in favor of ``.sort_values()`` (:issue:`12882`) @@ -136,6 +173,7 @@ Removal of prior version deprecations/changes Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ +- Improved performance of instantiating :class:`SparseDataFrame` (:issue:`16773`) .. _whatsnew_0210.bug_fixes: @@ -143,7 +181,6 @@ Performance Improvements Bug Fixes ~~~~~~~~~ -- Fixes regression in 0.20, :func:`Series.aggregate` and :func:`DataFrame.aggregate` allow dictionaries as return values again (:issue:`16741`) Conversion ^^^^^^^^^^ @@ -157,6 +194,8 @@ Indexing - Fixes regression in 0.20.3 when indexing with a string on a ``TimedeltaIndex`` (:issue:`16896`). - Fixed :func:`TimedeltaIndex.get_loc` handling of ``np.timedelta64`` inputs (:issue:`16909`). - Fix :func:`MultiIndex.sort_index` ordering when ``ascending`` argument is a list, but not all levels are specified, or are in a different order (:issue:`16934`). 
+- Fixes bug where indexing with ``np.inf`` caused an ``OverflowError`` to be raised (:issue:`16957`) +- Bug in reindexing on an empty ``CategoricalIndex`` (:issue:`16770`) I/O ^^^ @@ -180,13 +219,15 @@ Groupby/Resample/Rolling Sparse ^^^^^^ - +- Bug in ``SparseSeries`` raises ``AttributeError`` when a dictionary is passed in as data (:issue:`16777`) Reshaping ^^^^^^^^^ - Joining/Merging with a non unique ``PeriodIndex`` raised a TypeError (:issue:`16871`) - Merging with categorical date columns raised a TypeError (:issue:`16900`) +- Bug when using :func:`isin` on a large object series and large comparison array (:issue:`16012`) +- Fixes regression from 0.20, :func:`Series.aggregate` and :func:`DataFrame.aggregate` allow dictionaries as return values again (:issue:`16741`) Numeric @@ -202,3 +243,4 @@ Categorical Other ^^^^^ - Bug in :func:`eval` where the ``inplace`` parameter was being incorrectly handled (:issue:`16732`) +- Bug in ``.isin()`` in which checking membership in empty ``Series`` objects raised an error (:issue:`16991`) diff --git a/doc/sphinxext/ipython_sphinxext/ipython_directive.py b/doc/sphinxext/ipython_sphinxext/ipython_directive.py index 49fbacba99592..922767a8e2d46 100644 --- a/doc/sphinxext/ipython_sphinxext/ipython_directive.py +++ b/doc/sphinxext/ipython_sphinxext/ipython_directive.py @@ -111,7 +111,7 @@ import sys import tempfile import ast -from pandas.compat import zip, range, map, lmap, u, cStringIO as StringIO +from pandas.compat import zip, range, map, lmap, u, text_type, cStringIO as StringIO import warnings # To keep compatibility with various python versions @@ -138,10 +138,8 @@ if PY3: from io import StringIO - text_type = str else: from StringIO import StringIO - text_type = unicode #----------------------------------------------------------------------------- # Globals diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index b490bf787a037..79beb95d93ea1 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -65,6 +65,8 @@ def _ensure_data(values, dtype=None): # we check some simple dtypes first try: + if is_object_dtype(dtype): + return _ensure_object(np.asarray(values)), 'object', 'object' if is_bool_dtype(values) or is_bool_dtype(dtype): # we are actually coercing to uint64 # until our algos suppport uint8 directly (see TODO) @@ -402,7 +404,10 @@ def isin(comps, values): # work-around for numpy < 1.8 and comparisions on py3 # faster for larger cases to use np.in1d f = lambda x, y: htable.ismember_object(x, values) - if (_np_version_under1p8 and compat.PY3) or len(comps) > 1000000: + # GH16012 + # Ensure np.in1d doesn't get object types or it *may* throw an exception + if (((_np_version_under1p8 and compat.PY3) or len(comps) > 1000000) and + not is_object_dtype(comps)): f = lambda x, y: np.in1d(x, y) elif is_integer_dtype(comps): try: diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index ae3001564a62f..06ce811703a8c 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -213,14 +213,6 @@ def use_numexpr_cb(key): (currently both are identical) """ -pc_line_width_deprecation_warning = """\ -line_width has been deprecated, use display.width instead (currently both are -identical) -""" - -pc_height_deprecation_warning = """\ -height has been deprecated. 
-""" pc_width_doc = """ : int @@ -383,14 +375,6 @@ def table_schema_cb(key): cf.register_option('html.border', 1, pc_html_border_doc, validator=is_int) - -cf.deprecate_option('display.line_width', - msg=pc_line_width_deprecation_warning, - rkey='display.width') - -cf.deprecate_option('display.height', msg=pc_height_deprecation_warning, - rkey='display.max_rows') - with cf.config_prefix('html'): cf.register_option('border', 1, pc_html_border_doc, validator=is_int) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9a79ca1d4eab1..e554e136cdb80 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3023,35 +3023,98 @@ def reset_index(self, level=None, drop=False, inplace=False, col_level=0, Examples -------- - >>> df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8]}, - ... index=pd.Index(['a', 'b', 'c', 'd'], - ... name='idx')) + >>> df = pd.DataFrame([('bird', 389.0), + ... ('bird', 24.0), + ... ('mammal', 80.5), + ... ('mammal', np.nan)], + ... index=['falcon', 'parrot', 'lion', 'monkey'], + ... columns=('class', 'max_speed')) + >>> df + class max_speed + falcon bird 389.0 + parrot bird 24.0 + lion mammal 80.5 + monkey mammal NaN + + When we reset the index, the old index is added as a column, and a + new sequential index is used: + >>> df.reset_index() - idx a b - 0 a 1 5 - 1 b 2 6 - 2 c 3 7 - 3 d 4 8 - - >>> arrays = [np.array(['bar', 'bar', 'baz', 'baz', 'foo', - ... 'foo', 'qux', 'qux']), - ... np.array(['one', 'two', 'one', 'two', 'one', 'two', - ... 'one', 'two'])] - >>> df2 = pd.DataFrame( - ... np.random.randn(8, 4), - ... index=pd.MultiIndex.from_arrays(arrays, - ... names=['a', 'b'])) - >>> df2.reset_index(level='a') - a 0 1 2 3 - b - one bar -1.099413 0.291838 0.598198 0.162181 - two bar -0.312184 -0.119904 0.250360 0.364378 - one baz 0.713596 -0.490636 0.074967 -0.297857 - two baz 0.998397 0.524499 -2.228976 0.901155 - one foo 0.923204 0.920695 1.264488 1.476921 - two foo -1.566922 0.783278 -0.073656 0.266027 - one qux -0.230470 0.109800 -1.383409 0.048421 - two qux -0.865993 -0.865984 0.705367 -0.170446 + index class max_speed + 0 falcon bird 389.0 + 1 parrot bird 24.0 + 2 lion mammal 80.5 + 3 monkey mammal NaN + + We can use the `drop` parameter to avoid the old index being added as + a column: + + >>> df.reset_index(drop=True) + class max_speed + 0 bird 389.0 + 1 bird 24.0 + 2 mammal 80.5 + 3 mammal NaN + + You can also use `reset_index` with `MultiIndex`. + + >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'), + ... ('bird', 'parrot'), + ... ('mammal', 'lion'), + ... ('mammal', 'monkey')], + ... names=['class', 'name']) + >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'), + ... ('speed', 'type')]) + >>> df = pd.DataFrame([(389.0, 'fly'), + ... ( 24.0, 'fly'), + ... ( 80.5, 'run'), + ... (np.nan, 'jump')], + ... index=index, + ... columns=columns) + >>> df + speed + max type + class name + bird falcon 389.0 fly + parrot 24.0 fly + mammal lion 80.5 run + monkey NaN jump + + If the index has multiple levels, we can reset a subset of them: + + >>> df.reset_index(level='class') + class speed + max type + name + falcon bird 389.0 fly + parrot bird 24.0 fly + lion mammal 80.5 run + monkey mammal NaN jump + + If we are not dropping the index, by default, it is placed in the top + level. 
We can place it in another level: + + >>> df.reset_index(level='class', col_level=1) + speed + class max type + name + falcon bird 389.0 fly + parrot bird 24.0 fly + lion mammal 80.5 run + monkey mammal NaN jump + + When the index is inserted under another level, we can specify under + which one with the parameter `col_fill`. If we specify a nonexistent + level, it is created: + + >>> df.reset_index(level='class', col_level=1, col_fill='species') + species speed + class max type + name + falcon bird 389.0 fly + parrot bird 24.0 fly + lion mammal 80.5 run + monkey mammal NaN jump """ inplace = validate_bool_kwarg(inplace, 'inplace') if inplace: @@ -4652,6 +4715,11 @@ def append(self, other, ignore_index=False, verify_integrity=False): the DataFrame's index, the order of the columns in the resulting DataFrame will be unchanged. + Iteratively appending rows to a DataFrame can be more computationally + intensive than a single concatenation. A better solution is to append + those rows to a list and then concatenate the list with the original + DataFrame all at once. + See also -------- pandas.concat : General function to concatenate DataFrame, Series @@ -4682,6 +4750,33 @@ def append(self, other, ignore_index=False, verify_integrity=False): 2 5 6 3 7 8 + While not recommended methods for generating DataFrames, the following + examples show two ways to build a DataFrame from multiple data sources. + + Less efficient: + + >>> df = pd.DataFrame(columns=['A']) + >>> for i in range(5): + ... df = df.append({'A': i}, ignore_index=True) + >>> df + A + 0 0 + 1 1 + 2 2 + 3 3 + 4 4 + + More efficient: + + >>> pd.concat([pd.DataFrame([i], columns=['A']) for i in range(5)], + ... ignore_index=True) + A + 0 0 + 1 1 + 2 2 + 3 3 + 4 4 + """ if isinstance(other, (Series, dict)): if isinstance(other, dict): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e4e2e0093b1a6..b2083a4454f84 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3152,7 +3152,7 @@ def pipe(self, func, *args, **kwargs): (e.g., np.mean(arr_2d, axis=0)) as opposed to mimicking the default Numpy behavior (e.g., np.mean(arr_2d)). - agg is an alias for aggregate. Use it. + `agg` is an alias for `aggregate`. Use the alias. Returns ------- @@ -3161,7 +3161,7 @@ def pipe(self, func, *args, **kwargs): _shared_docs['transform'] = (""" Call function producing a like-indexed %(klass)s - and return a %(klass)s with the transformed values` + and return a %(klass)s with the transformed values .. versionadded:: 0.20.0 @@ -3671,9 +3671,12 @@ def convert_objects(self, convert_dates=True, convert_numeric=False, converted : same as input object """ from warnings import warn - warn("convert_objects is deprecated. Use the data-type specific " - "converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.", - FutureWarning, stacklevel=2) + msg = ("convert_objects is deprecated. To re-infer data dtypes for " + "object columns, use {klass}.infer_objects()\nFor all " + "other conversions use the data-type specific converters " + "pd.to_datetime, pd.to_timedelta and pd.to_numeric." + ).format(klass=self.__class__.__name__) + warn(msg, FutureWarning, stacklevel=2) return self._constructor( self._data.convert(convert_dates=convert_dates, @@ -3681,6 +3684,53 @@ def convert_objects(self, convert_dates=True, convert_numeric=False, convert_timedeltas=convert_timedeltas, copy=copy)).__finalize__(self) + def infer_objects(self): + """ + Attempt to infer better dtypes for object columns.
+ + Attempts soft conversion of object-dtyped + columns, leaving non-object and unconvertible + columns unchanged. The inference rules are the + same as during normal Series/DataFrame construction. + + .. versionadded:: 0.21.0 + + See Also + -------- + pandas.to_datetime : Convert argument to datetime. + pandas.to_timedelta : Convert argument to timedelta. + pandas.to_numeric : Convert argument to numeric type. + + Returns + ------- + converted : same type as input object + + Examples + -------- + >>> df = pd.DataFrame({"A": ["a", 1, 2, 3]}) + >>> df = df.iloc[1:] + >>> df + A + 1 1 + 2 2 + 3 3 + + >>> df.dtypes + A object + dtype: object + + >>> df.infer_objects().dtypes + A int64 + dtype: object + """ + # numeric=False necessary to only soft convert; + # python objects will still be converted to + # native numpy numeric types + return self._constructor( + self._data.convert(datetime=True, numeric=False, + timedelta=True, coerce=False, + copy=True)).__finalize__(self) + # ---------------------------------------------------------------------- # Filling NA's diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index bbbc19b36964d..5d50f961927c7 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -666,7 +666,7 @@ def _try_convert_to_int_index(cls, data, copy, name): res = data.astype('u8', copy=False) if (res == data).all(): return UInt64Index(res, copy=copy, name=name) - except (TypeError, ValueError): + except (OverflowError, TypeError, ValueError): pass raise ValueError @@ -1640,7 +1640,7 @@ def __contains__(self, key): hash(key) try: return key in self._engine - except (TypeError, ValueError): + except (OverflowError, TypeError, ValueError): return False _index_shared_docs['contains'] = """ @@ -3365,7 +3365,7 @@ def _maybe_cast_indexer(self, key): ckey = int(key) if ckey == key: key = ckey - except (ValueError, TypeError): + except (OverflowError, ValueError, TypeError): pass return key diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index c9e0e3b10875c..e8427f847dd2d 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -419,7 +419,11 @@ def reindex(self, target, method=None, level=None, limit=None, raise ValueError("cannot reindex with a non-unique indexer") indexer, missing = self.get_indexer_non_unique(np.array(target)) - new_target = self.take(indexer) + + if len(self.codes): + new_target = self.take(indexer) + else: + new_target = target # filling in missing if needed if len(missing): @@ -430,7 +434,6 @@ def reindex(self, target, method=None, level=None, limit=None, result = Index(np.array(self), name=self.name) new_target, indexer, _ = result._reindex_non_unique( np.array(target)) - else: codes = new_target.codes.copy() diff --git a/pandas/core/series.py b/pandas/core/series.py index 4d5b718ce0ae9..c7ead292c8b63 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -37,7 +37,6 @@ maybe_convert_platform, maybe_cast_to_datetime, maybe_castable) from pandas.core.dtypes.missing import isnull, notnull, remove_na_arraylike - from pandas.core.common import (is_bool_indexer, _default_index, _asarray_tuplesafe, @@ -88,6 +87,17 @@ versionadded_to_excel='\n .. versionadded:: 0.20.0\n') +# see gh-16971 +def remove_na(arr): + """ + DEPRECATED : this function will be removed in a future version. + """ + + warnings.warn("remove_na is deprecated and is a private " + "function.
Do not use.", FutureWarning, stacklevel=2) + return remove_na_arraylike(arr) + + def _coerce_method(converter): """ install the scalar coercion methods """ @@ -1553,6 +1563,18 @@ def append(self, to_append, ignore_index=False, verify_integrity=False): verify_integrity : boolean, default False If True, raise Exception on creating index with duplicates + Notes + ----- + Iteratively appending to a Series can be more computationally intensive + than a single concatenate. A better solution is to append values to a + list and then concatenate the list with the original Series all at + once. + + See also + -------- + pandas.concat : General function to concatenate DataFrame, Series + or Panel objects + Returns ------- appended : Series diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 461dd50c5da6e..5fe96d70fc16f 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -143,7 +143,7 @@ def _init_dict(self, data, index, columns, dtype=None): sp_maker = lambda x: SparseArray(x, kind=self._default_kind, fill_value=self._default_fill_value, copy=True, dtype=dtype) - sdict = DataFrame() + sdict = {} for k, v in compat.iteritems(data): if isinstance(v, Series): # Force alignment, no copy necessary @@ -163,11 +163,10 @@ def _init_dict(self, data, index, columns, dtype=None): # TODO: figure out how to handle this case, all nan's? # add in any other columns we want to have (completeness) - nan_vec = np.empty(len(index)) - nan_vec.fill(nan) - for c in columns: - if c not in sdict: - sdict[c] = sp_maker(nan_vec) + nan_arr = np.empty(len(index), dtype='float64') + nan_arr.fill(np.nan) + nan_arr = sp_maker(nan_arr) + sdict.update((c, nan_arr) for c in columns if c not in sdict) return to_manager(sdict, columns, index) diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index 9dd061e26ba06..1bc9cf5379930 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -146,10 +146,9 @@ def __init__(self, data=None, index=None, sparse_index=None, kind='block', data = data._data elif isinstance(data, (Series, dict)): - if index is None: - index = data.index.view() + data = Series(data, index=index) + index = data.index.view() - data = Series(data) res = make_sparse(data, kind=kind, fill_value=fill_value) data, sparse_index, fill_value = res diff --git a/pandas/io/formats/console.py b/pandas/io/formats/console.py index ab75e3fa253ce..bdff59939a4de 100644 --- a/pandas/io/formats/console.py +++ b/pandas/io/formats/console.py @@ -53,7 +53,7 @@ def get_console_size(): display_width = get_option('display.width') # deprecated. 
- display_height = get_option('display.height', silent=True) + display_height = get_option('display.max_rows') # Consider # interactive shell terminal, can detect term size @@ -71,7 +71,7 @@ def get_console_size(): # match default for width,height in config_init from pandas.core.config import get_default_val terminal_width = get_default_val('display.width') - terminal_height = get_default_val('display.height') + terminal_height = get_default_val('display.max_rows') else: # pure terminal terminal_width, terminal_height = get_terminal_size() diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index b08d3877f3b03..d88a230b42403 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -1054,9 +1054,9 @@ def highlight_max(self, subset=None, color='yellow', axis=0): subset: IndexSlice, default None a valid slice for ``data`` to limit the style application to color: str, default 'yellow' - axis: int, str, or None; default None - 0 or 'index' for columnwise, 1 or 'columns' for rowwise - or ``None`` for tablewise (the default) + axis: int, str, or None; default 0 + 0 or 'index' for columnwise (default), 1 or 'columns' for rowwise, + or ``None`` for tablewise Returns ------- @@ -1076,9 +1076,9 @@ def highlight_min(self, subset=None, color='yellow', axis=0): subset: IndexSlice, default None a valid slice for ``data`` to limit the style application to color: str, default 'yellow' - axis: int, str, or None; default None - 0 or 'index' for columnwise, 1 or 'columns' for rowwise - or ``None`` for tablewise (the default) + axis: int, str, or None; default 0 + 0 or 'index' for columnwise (default), 1 or 'columns' for rowwise, + or ``None`` for tablewise Returns ------- diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 343bc7a74fde8..1e7d9d420b35d 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -63,8 +63,6 @@ file. For file URLs, a host is expected. For instance, a local file could be file ://localhost/path/to/table.csv %s -delimiter : str, default ``None`` - Alternative argument name for sep. delim_whitespace : boolean, default False Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be used as the sep. Equivalent to setting ``sep='\s+'``. If this option @@ -316,7 +314,9 @@ be used automatically. In addition, separators longer than 1 character and different from ``'\s+'`` will be interpreted as regular expressions and will also force the use of the Python parsing engine. Note that regex - delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``""" + delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'`` +delimiter : str, default ``None`` + Alternative argument name for sep.""" _read_csv_doc = """ Read CSV (comma-separated) file into DataFrame @@ -341,15 +341,16 @@ widths : list of ints. optional A list of field widths which can be used instead of 'colspecs' if the intervals are contiguous. +delimiter : str, default ``'\t' + ' '`` + Characters to consider as filler characters in the fixed-width file. + Can be used to specify the filler character of the fields + if it is not spaces (e.g., '~'). """ _read_fwf_doc = """ Read a table of fixed-width formatted lines into DataFrame %s - -Also, 'delimiter' is used to specify the filler character of the -fields if it is not spaces (e.g., '~'). 
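# Illustration only, not part of the patch: a hypothetical sketch of the
# ``delimiter`` behavior documented above. read_fwf treats the given
# characters as field filler, so '~' padding is stripped from the parsed
# values; the data layout below is invented:
#
#     import pandas as pd
#     from pandas.compat import StringIO
#
#     data = "id8141~~~360.24\nid1594~~~444.95"
#     df = pd.read_fwf(StringIO(data), widths=[9, 6],
#                      delimiter='~', header=None)
#     # first column -> 'id8141', 'id1594'; the filler is dropped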
""" % (_parser_params % (_fwf_widths, '')) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index b09325bfa2ddc..da1c68005b9b2 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1151,10 +1151,13 @@ def test_isin(self): expected = DataFrame([df.loc[s].isin(other) for s in df.index]) tm.assert_frame_equal(result, expected) - def test_isin_empty(self): + @pytest.mark.parametrize("empty", [[], Series(), np.array([])]) + def test_isin_empty(self, empty): + # see gh-16991 df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']}) - result = df.isin([]) - expected = pd.DataFrame(False, df.index, df.columns) + expected = DataFrame(False, df.index, df.columns) + + result = df.isin(empty) tm.assert_frame_equal(result, expected) def test_isin_dict(self): diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index c1a5b437be5d0..f66070fd66813 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -495,6 +495,32 @@ def test_convert_objects_no_conversion(self): mixed2 = mixed1._convert(datetime=True) assert_frame_equal(mixed1, mixed2) + def test_infer_objects(self): + # GH 11221 + df = DataFrame({'a': ['a', 1, 2, 3], + 'b': ['b', 2.0, 3.0, 4.1], + 'c': ['c', datetime(2016, 1, 1), + datetime(2016, 1, 2), + datetime(2016, 1, 3)], + 'd': [1, 2, 3, 'd']}, + columns=['a', 'b', 'c', 'd']) + df = df.iloc[1:].infer_objects() + + assert df['a'].dtype == 'int64' + assert df['b'].dtype == 'float64' + assert df['c'].dtype == 'M8[ns]' + assert df['d'].dtype == 'object' + + expected = DataFrame({'a': [1, 2, 3], + 'b': [2.0, 3.0, 4.1], + 'c': [datetime(2016, 1, 1), + datetime(2016, 1, 2), + datetime(2016, 1, 3)], + 'd': [2, 3, 'd']}, + columns=['a', 'b', 'c', 'd']) + # reconstruct frame to verify inference is same + tm.assert_frame_equal(df.reset_index(drop=True), expected) + def test_stale_cached_series_bug_473(self): # this is chained, but ok diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 62686b356dc30..da4ca83c10dda 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -33,6 +33,31 @@ def test_date_range_gen_error(self): rng = date_range('1/1/2000 00:00', '1/1/2000 00:18', freq='5min') assert len(rng) == 4 + @pytest.mark.parametrize("freq", ["AS", "YS"]) + def test_begin_year_alias(self, freq): + # see gh-9313 + rng = date_range("1/1/2013", "7/1/2017", freq=freq) + exp = pd.DatetimeIndex(["2013-01-01", "2014-01-01", + "2015-01-01", "2016-01-01", + "2017-01-01"], freq=freq) + tm.assert_index_equal(rng, exp) + + @pytest.mark.parametrize("freq", ["A", "Y"]) + def test_end_year_alias(self, freq): + # see gh-9313 + rng = date_range("1/1/2013", "7/1/2017", freq=freq) + exp = pd.DatetimeIndex(["2013-12-31", "2014-12-31", + "2015-12-31", "2016-12-31"], freq=freq) + tm.assert_index_equal(rng, exp) + + @pytest.mark.parametrize("freq", ["BA", "BY"]) + def test_business_end_year_alias(self, freq): + # see gh-9313 + rng = date_range("1/1/2013", "7/1/2017", freq=freq) + exp = pd.DatetimeIndex(["2013-12-31", "2014-12-31", + "2015-12-31", "2016-12-30"], freq=freq) + tm.assert_index_equal(rng, exp) + def test_date_range_negative_freq(self): # GH 11018 rng = date_range('2011-12-31', freq='-2A', periods=3) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 
18dbe6624008a..692cdd4957947 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1407,6 +1407,15 @@ def check_idx(idx): # Float64Index overrides isin, so must be checked separately check_idx(Float64Index([1.0, 2.0, 3.0, 4.0])) + @pytest.mark.parametrize("empty", [[], Series(), np.array([])]) + def test_isin_empty(self, empty): + # see gh-16991 + idx = Index(["a", "b"]) + expected = np.array([False, False]) + + result = idx.isin(empty) + tm.assert_numpy_array_equal(expected, result) + def test_boolean_cmp(self): values = [1, 2, 3, 4] diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 14f344acbefb2..e8d780e041316 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -419,6 +419,14 @@ def test_reindex_dtype(self): tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) + def test_reindex_empty_index(self): + # See GH16770 + c = CategoricalIndex([]) + res, indexer = c.reindex(['a', 'b']) + tm.assert_index_equal(res, Index(['a', 'b']), exact=True) + tm.assert_numpy_array_equal(indexer, + np.array([-1, -1], dtype=np.intp)) + def test_duplicates(self): idx = CategoricalIndex([0, 0, 0], name='foo') diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 9fa677eb624ae..98f5d5eb140df 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -63,6 +63,34 @@ def f(): pytest.raises(ValueError, f) + def test_inf_upcast(self): + # GH 16957 + # We should be able to use np.inf as a key + # np.inf should cause an index to convert to float + + # Test with np.inf in rows + df = pd.DataFrame(columns=[0]) + df.loc[1] = 1 + df.loc[2] = 2 + df.loc[np.inf] = 3 + + # make sure we can look up the value + assert df.loc[np.inf, 0] == 3 + + result = df.index + expected = pd.Float64Index([1, 2, np.inf]) + tm.assert_index_equal(result, expected) + + # Test with np.inf in columns + df = pd.DataFrame() + df.loc[0, 0] = 1 + df.loc[1, 1] = 2 + df.loc[0, np.inf] = 3 + + result = df.columns + expected = pd.Float64Index([0, 1, np.inf]) + tm.assert_index_equal(result, expected) + def test_setitem_dtype_upcast(self): # GH3216 @@ -542,6 +570,34 @@ def test_astype_assignment_with_dups(self): # result = df.get_dtype_counts().sort_index() # expected = Series({'float64': 2, 'object': 1}).sort_index() + @pytest.mark.parametrize("index,val", [ + (pd.Index([0, 1, 2]), 2), + (pd.Index([0, 1, '2']), '2'), + (pd.Index([0, 1, 2, np.inf, 4]), 4), + (pd.Index([0, 1, 2, np.nan, 4]), 4), + (pd.Index([0, 1, 2, np.inf]), np.inf), + (pd.Index([0, 1, 2, np.nan]), np.nan), + ]) + def test_index_contains(self, index, val): + assert val in index + + @pytest.mark.parametrize("index,val", [ + (pd.Index([0, 1, 2]), '2'), + (pd.Index([0, 1, '2']), 2), + (pd.Index([0, 1, 2, np.inf]), 4), + (pd.Index([0, 1, 2, np.nan]), 4), + (pd.Index([0, 1, 2, np.inf]), np.nan), + (pd.Index([0, 1, 2, np.nan]), np.inf), + # Checking if np.inf in Int64Index should not cause an OverflowError + # Related to GH 16957 + (pd.Int64Index([0, 1, 2]), np.inf), + (pd.Int64Index([0, 1, 2]), np.nan), + (pd.UInt64Index([0, 1, 2]), np.inf), + (pd.UInt64Index([0, 1, 2]), np.nan), + ]) + def test_index_not_contains(self, index, val): + assert val not in index + def test_index_type_coercion(self): with catch_warnings(record=True): diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 679d43ac492ca..e1499565ce4a6 100644 
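# Illustration only, not part of the patch: given the ``OverflowError``
# handling added to pandas/core/indexes/base.py above, membership checks
# against integer indexes are expected to return False for np.inf/np.nan
# rather than raise. A minimal sketch of the intended behavior:
#
#     import numpy as np
#     import pandas as pd
#
#     idx = pd.Int64Index([0, 1, 2])
#     np.inf in idx    # False (previously could raise OverflowError)
#     np.nan in idx    # False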
--- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -302,7 +302,7 @@ def test_repr_non_interactive(self): df = DataFrame('hello', lrange(1000), lrange(5)) with option_context('mode.sim_interactive', False, 'display.width', 0, - 'display.height', 0, 'display.max_rows', 5000): + 'display.max_rows', 5000): assert not has_truncated_repr(df) assert not has_expanded_repr(df) diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 0bfeb5215f370..ec1d1a2a51cdc 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -405,3 +405,32 @@ def test_skiprows_inference_empty(self): with pytest.raises(EmptyDataError): read_fwf(StringIO(test), skiprows=3) + + def test_whitespace_preservation(self): + # Addresses Issue #16772 + data_expected = """ + a ,bbb + cc,dd """ + expected = read_csv(StringIO(data_expected), header=None) + + test_data = """ + a bbb + ccdd """ + result = read_fwf(StringIO(test_data), widths=[3, 3], + header=None, skiprows=[0], delimiter="\n\t") + + tm.assert_frame_equal(result, expected) + + def test_default_delimiter(self): + data_expected = """ +a,bbb +cc,dd""" + expected = read_csv(StringIO(data_expected), header=None) + + test_data = """ +a \tbbb +cc\tdd """ + result = read_fwf(StringIO(test_data), widths=[3, 3], + header=None, skiprows=[0]) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index d47a95924bd10..632d3b4ad2e7a 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -643,6 +643,10 @@ def test_dataframe_dummies_preserve_categorical_dtype(self): class TestGetDummiesSparse(TestGetDummies): sparse = True + @pytest.mark.xfail(reason='nan in index is problematic (GH 16894)') + def test_include_na(self): + super(TestGetDummiesSparse, self).test_include_na() + class TestMakeAxisDummies(object): diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 749af1c56a7f0..7aab7df7169d4 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1092,6 +1092,18 @@ def test_isin(self): expected = Series([True, False, True, False, False, False, True, True]) assert_series_equal(result, expected) + # GH: 16012 + # This specific issue has to have a series over 1e6 in len, but the + # comparison array (in_list) must be large enough so that numpy doesn't + # do a manual masking trick that will avoid this issue altogether + s = Series(list('abcdefghijk' * 10 ** 5)) + # If numpy doesn't do the manual comparison/mask, these + # unorderable mixed types are what cause the exception in numpy + in_list = [-1, 'a', 'b', 'G', 'Y', 'Z', 'E', + 'K', 'E', 'S', 'I', 'R', 'R'] * 6 + + assert s.isin(in_list).sum() == 200000 + def test_isin_with_string_scalar(self): # GH4763 s = Series(['A', 'B', 'C', 'a', 'B', 'B', 'A', 'C']) @@ -1135,6 +1147,15 @@ def test_isin_with_i8(self): result = s.isin(s[0:2]) assert_series_equal(result, expected) + @pytest.mark.parametrize("empty", [[], Series(), np.array([])]) + def test_isin_empty(self, empty): + # see gh-16991 + s = Series(["a", "b"]) + expected = Series([False, False]) + + result = s.isin(empty) + tm.assert_series_equal(expected, result) + def test_timedelta64_analytics(self): from pandas import date_range diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 2ec579842e33f..c214280ee8386 100644 --- 
a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -268,3 +268,21 @@ def test_series_to_categorical(self): expected = Series(['a', 'b', 'c'], dtype='category') tm.assert_series_equal(result, expected) + + def test_infer_objects_series(self): + # GH 11221 + actual = Series(np.array([1, 2, 3], dtype='O')).infer_objects() + expected = Series([1, 2, 3]) + tm.assert_series_equal(actual, expected) + + actual = Series(np.array([1, 2, 3, None], dtype='O')).infer_objects() + expected = Series([1., 2., 3., np.nan]) + tm.assert_series_equal(actual, expected) + + # only soft conversions, unconvertible pass through unchanged + actual = (Series(np.array([1, 2, 3, None, 'a'], dtype='O')) + .infer_objects()) + expected = Series([1, 2, 3, None, 'a']) + + assert actual.dtype == 'object' + tm.assert_series_equal(actual, expected) diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 8e73c17684a16..b5948e75aa73e 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -15,6 +15,7 @@ MultiIndex, Index, Timestamp, NaT, IntervalIndex) from pandas.compat import range from pandas._libs.tslib import iNaT +from pandas.core.series import remove_na from pandas.util.testing import assert_series_equal, assert_frame_equal import pandas.util.testing as tm @@ -50,6 +51,11 @@ def _simple_ts(start, end, freq='D'): class TestSeriesMissingData(TestData): + def test_remove_na_deprecation(self): + # see gh-16971 + with tm.assert_produces_warning(FutureWarning): + remove_na(Series([])) + def test_timedelta_fillna(self): # GH 3371 s = Series([Timestamp('20130101'), Timestamp('20130101'), Timestamp( diff --git a/pandas/tests/sparse/test_frame.py b/pandas/tests/sparse/test_frame.py index 654d12b782f37..a5d514644a8f1 100644 --- a/pandas/tests/sparse/test_frame.py +++ b/pandas/tests/sparse/test_frame.py @@ -1095,6 +1095,8 @@ def test_as_blocks(self): assert list(df_blocks.keys()) == ['float64'] tm.assert_frame_equal(df_blocks['float64'], df) + @pytest.mark.xfail(reason='nan column names in _init_dict problematic ' + '(GH 16894)') def test_nan_columnname(self): # GH 8822 nan_colname = DataFrame(Series(1.0, index=[0]), columns=[nan]) diff --git a/pandas/tests/sparse/test_series.py b/pandas/tests/sparse/test_series.py index b524d6bfab418..bb56f8a51897a 100644 --- a/pandas/tests/sparse/test_series.py +++ b/pandas/tests/sparse/test_series.py @@ -88,6 +88,24 @@ def setup_method(self, method): self.ziseries2 = SparseSeries(arr, index=index, kind='integer', fill_value=0) + def test_constructor_dict_input(self): + # gh-16905 + constructor_dict = {1: 1.} + index = [0, 1, 2] + + # Series with index passed in + series = pd.Series(constructor_dict) + expected = SparseSeries(series, index=index) + + result = SparseSeries(constructor_dict, index=index) + tm.assert_sp_series_equal(result, expected) + + # Series with index and dictionary with no index + expected = SparseSeries(series) + + result = SparseSeries(constructor_dict) + tm.assert_sp_series_equal(result, expected) + def test_constructor_dtype(self): arr = SparseSeries([np.nan, 1, 2, np.nan]) assert arr.dtype == np.float64 diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 993dcc4f527b2..4588bf17fdbeb 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -597,6 +597,15 @@ def test_categorical_from_codes(self): result = algos.isin(Sd, St) tm.assert_numpy_array_equal(expected, result) + @pytest.mark.parametrize("empty", [[], pd.Series(),
np.array([])]) + def test_empty(self, empty): + # see gh-16991 + vals = pd.Index(["a", "b"]) + expected = np.array([False, False]) + + result = algos.isin(vals, empty) + tm.assert_numpy_array_equal(expected, result) + class TestValueCounts(object): diff --git a/pandas/tests/tseries/test_frequencies.py b/pandas/tests/tseries/test_frequencies.py index 54d12317b0bf8..4bcd0b49db7e0 100644 --- a/pandas/tests/tseries/test_frequencies.py +++ b/pandas/tests/tseries/test_frequencies.py @@ -248,9 +248,10 @@ def test_anchored_shortcuts(self): # ensure invalid cases fail as expected invalid_anchors = ['SM-0', 'SM-28', 'SM-29', - 'SM-FOO', 'BSM', 'SM--1' + 'SM-FOO', 'BSM', 'SM--1', 'SMS-1', 'SMS-28', 'SMS-30', - 'SMS-BAR', 'BSMS', 'SMS--2'] + 'SMS-BAR', 'SMS-BYR', 'BSMS', + 'SMS--2'] for invalid_anchor in invalid_anchors: with tm.assert_raises_regex(ValueError, 'Invalid frequency: '): @@ -292,11 +293,15 @@ def test_get_rule_month(): result = frequencies._get_rule_month('A-DEC') assert (result == 'DEC') + result = frequencies._get_rule_month('Y-DEC') + assert (result == 'DEC') result = frequencies._get_rule_month(offsets.YearEnd()) assert (result == 'DEC') result = frequencies._get_rule_month('A-MAY') assert (result == 'MAY') + result = frequencies._get_rule_month('Y-MAY') + assert (result == 'MAY') result = frequencies._get_rule_month(offsets.YearEnd(month=5)) assert (result == 'MAY') @@ -305,6 +310,10 @@ def test_period_str_to_code(): assert (frequencies._period_str_to_code('A') == 1000) assert (frequencies._period_str_to_code('A-DEC') == 1000) assert (frequencies._period_str_to_code('A-JAN') == 1001) + assert (frequencies._period_str_to_code('Y') == 1000) + assert (frequencies._period_str_to_code('Y-DEC') == 1000) + assert (frequencies._period_str_to_code('Y-JAN') == 1001) + + assert (frequencies._period_str_to_code('Q') == 2000) assert (frequencies._period_str_to_code('Q-DEC') == 2000) assert (frequencies._period_str_to_code('Q-FEB') == 2002) @@ -349,6 +358,10 @@ def test_freq_code(self): assert frequencies.get_freq('3A') == 1000 assert frequencies.get_freq('-1A') == 1000 + assert frequencies.get_freq('Y') == 1000 + assert frequencies.get_freq('3Y') == 1000 + assert frequencies.get_freq('-1Y') == 1000 + assert frequencies.get_freq('W') == 4000 assert frequencies.get_freq('W-MON') == 4001 assert frequencies.get_freq('W-FRI') == 4005 @@ -369,6 +382,13 @@ def test_freq_group(self): assert frequencies.get_freq_group('-1A') == 1000 assert frequencies.get_freq_group('A-JAN') == 1000 assert frequencies.get_freq_group('A-MAY') == 1000 + + assert frequencies.get_freq_group('Y') == 1000 + assert frequencies.get_freq_group('3Y') == 1000 + assert frequencies.get_freq_group('-1Y') == 1000 + assert frequencies.get_freq_group('Y-JAN') == 1000 + assert frequencies.get_freq_group('Y-MAY') == 1000 + assert frequencies.get_freq_group(offsets.YearEnd()) == 1000 assert frequencies.get_freq_group(offsets.YearEnd(month=1)) == 1000 assert frequencies.get_freq_group(offsets.YearEnd(month=5)) == 1000 @@ -790,12 +810,6 @@ def test_series(self): for freq in [None, 'L']: s = Series(period_range('2013', periods=10, freq=freq)) pytest.raises(TypeError, lambda: frequencies.infer_freq(s)) - for freq in ['Y']: - - msg = frequencies._INVALID_FREQ_ERROR - with tm.assert_raises_regex(ValueError, msg): - s = Series(period_range('2013', periods=10, freq=freq)) - pytest.raises(TypeError, lambda: frequencies.infer_freq(s)) # DateTimeIndex for freq in ['M', 'L', 'S']: @@ -812,11 +826,12 @@ def test_legacy_offset_warnings(self): 'W@FRI',
'W@SAT', 'W@SUN', 'Q@JAN', 'Q@FEB', 'Q@MAR', 'A@JAN', 'A@FEB', 'A@MAR', 'A@APR', 'A@MAY', 'A@JUN', 'A@JUL', 'A@AUG', 'A@SEP', 'A@OCT', 'A@NOV', 'A@DEC', - 'WOM@1MON', 'WOM@2MON', 'WOM@3MON', 'WOM@4MON', - 'WOM@1TUE', 'WOM@2TUE', 'WOM@3TUE', 'WOM@4TUE', - 'WOM@1WED', 'WOM@2WED', 'WOM@3WED', 'WOM@4WED', - 'WOM@1THU', 'WOM@2THU', 'WOM@3THU', 'WOM@4THU' - 'WOM@1FRI', 'WOM@2FRI', 'WOM@3FRI', 'WOM@4FRI'] + 'Y@JAN', 'WOM@1MON', 'WOM@2MON', 'WOM@3MON', + 'WOM@4MON', 'WOM@1TUE', 'WOM@2TUE', 'WOM@3TUE', + 'WOM@4TUE', 'WOM@1WED', 'WOM@2WED', 'WOM@3WED', + 'WOM@4WED', 'WOM@1THU', 'WOM@2THU', 'WOM@3THU', + 'WOM@4THU', 'WOM@1FRI', 'WOM@2FRI', 'WOM@3FRI', + 'WOM@4FRI'] msg = frequencies._INVALID_FREQ_ERROR for freq in freqs: diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index c5f6c00a4005a..aa33a3849acb3 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -399,10 +399,14 @@ def _get_freq_str(base, mult=1): 'Q': 'Q', 'A': 'A', 'W': 'W', - 'M': 'M' + 'M': 'M', + 'Y': 'A', + 'BY': 'A', + 'YS': 'A', + 'BYS': 'A', } -need_suffix = ['QS', 'BQ', 'BQS', 'AS', 'BA', 'BAS'] +need_suffix = ['QS', 'BQ', 'BQS', 'YS', 'AS', 'BY', 'BA', 'BYS', 'BAS'] for __prefix in need_suffix: for _m in tslib._MONTHS: _offset_to_period_map['%s-%s' % (__prefix, _m)] = \ @@ -427,9 +431,13 @@ def get_period_alias(offset_str): 'Q': 'Q-DEC', 'A': 'A-DEC', # YearEnd(month=12), + 'Y': 'A-DEC', 'AS': 'AS-JAN', # YearBegin(month=1), + 'YS': 'AS-JAN', 'BA': 'BA-DEC', # BYearEnd(month=12), + 'BY': 'BA-DEC', 'BAS': 'BAS-JAN', # BYearBegin(month=1), + 'BYS': 'BAS-JAN', 'Min': 'T', 'min': 'T', @@ -708,7 +716,17 @@ def get_standard_freq(freq): for _k, _v in compat.iteritems(_period_code_map): _reverse_period_code_map[_v] = _k -# Additional aliases +# Yearly aliases +year_aliases = {} + +for k, v in compat.iteritems(_period_code_map): + if k.startswith("A-"): + alias = "Y" + k[1:] + year_aliases[alias] = v + +_period_code_map.update(**year_aliases) +del year_aliases + _period_code_map.update({ "Q": 2000, # Quarterly - December year end (default quarterly) "A": 1000, # Annual diff --git a/scripts/find_commits_touching_func.py b/scripts/find_commits_touching_func.py index 099761f38bb44..74ea120bf0b64 100755 --- a/scripts/find_commits_touching_func.py +++ b/scripts/find_commits_touching_func.py @@ -4,7 +4,7 @@ # copryright 2013, y-p @ github from __future__ import print_function -from pandas.compat import range, lrange, map +from pandas.compat import range, lrange, map, string_types, text_type """Search the git history for all commits touching a named method @@ -94,7 +94,7 @@ def get_hits(defname,files=()): def get_commit_info(c,fmt,sep='\t'): r=sh.git('log', "--format={}".format(fmt), '{}^..{}'.format(c,c),"-n","1",_tty_out=False) - return compat.text_type(r).split(sep) + return text_type(r).split(sep) def get_commit_vitals(c,hlen=HASH_LEN): h,s,d= get_commit_info(c,'%H\t%s\t%ci',"\t") @@ -183,11 +183,11 @@ def main(): !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
""") return - if isinstance(args.file_masks,compat.string_types): + if isinstance(args.file_masks, string_types): args.file_masks = args.file_masks.split(',') - if isinstance(args.path_masks,compat.string_types): + if isinstance(args.path_masks, string_types): args.path_masks = args.path_masks.split(',') - if isinstance(args.dir_masks,compat.string_types): + if isinstance(args.dir_masks, string_types): args.dir_masks = args.dir_masks.split(',') logger.setLevel(getattr(logging,args.debug_level)) diff --git a/scripts/windows_builder/build_27-32.bat b/scripts/windows_builder/build_27-32.bat deleted file mode 100644 index 37eb4d436d567..0000000000000 --- a/scripts/windows_builder/build_27-32.bat +++ /dev/null @@ -1,25 +0,0 @@ -@echo off -echo "starting 27-32" - -setlocal EnableDelayedExpansion -set MSSdk=1 -CALL "C:\Program Files\Microsoft SDKs\Windows\v7.0\Bin\SetEnv.cmd" /x86 /release -set DISTUTILS_USE_SDK=1 - -title 27-32 build -echo "building" -cd "c:\users\Jeff Reback\documents\github\pandas" -C:\python27-32\python.exe setup.py build > build.27-32.log 2>&1 - -title "installing" -C:\python27-32\python.exe setup.py bdist --formats=wininst > install.27-32.log 2>&1 - -echo "testing" -C:\python27-32\scripts\nosetests -A "not slow" build\lib.win32-2.7\pandas > test.27-32.log 2>&1 - -echo "versions" -cd build\lib.win32-2.7 -C:\python27-32\python.exe ../../ci/print_versions.py > ../../versions.27-32.log 2>&1 - -exit - diff --git a/scripts/windows_builder/build_27-64.bat b/scripts/windows_builder/build_27-64.bat deleted file mode 100644 index e76e25d0ef39c..0000000000000 --- a/scripts/windows_builder/build_27-64.bat +++ /dev/null @@ -1,25 +0,0 @@ -@echo off -echo "starting 27-64" - -setlocal EnableDelayedExpansion -set MSSdk=1 -CALL "C:\Program Files\Microsoft SDKs\Windows\v7.0\Bin\SetEnv.cmd" /x64 /release -set DISTUTILS_USE_SDK=1 - -title 27-64 build -echo "building" -cd "c:\users\Jeff Reback\documents\github\pandas" -C:\python27-64\python.exe setup.py build > build.27-64.log 2>&1 - -echo "installing" -C:\python27-64\python.exe setup.py bdist --formats=wininst > install.27-64.log 2>&1 - -echo "testing" -C:\python27-64\scripts\nosetests -A "not slow" build\lib.win-amd64-2.7\pandas > test.27-64.log 2>&1 - -echo "versions" -cd build\lib.win-amd64-2.7 -C:\python27-64\python.exe ../../ci/print_versions.py > ../../versions.27-64.log 2>&1 - -exit - diff --git a/scripts/windows_builder/build_34-32.bat b/scripts/windows_builder/build_34-32.bat deleted file mode 100644 index 8e060e000bc8f..0000000000000 --- a/scripts/windows_builder/build_34-32.bat +++ /dev/null @@ -1,27 +0,0 @@ -@echo off -echo "starting 34-32" - -setlocal EnableDelayedExpansion -set MSSdk=1 -CALL "C:\Program Files\Microsoft SDKs\Windows\v7.1\Bin\SetEnv.cmd" /x86 /release -set DISTUTILS_USE_SDK=1 - -title 34-32 build -echo "building" -cd "c:\users\Jeff Reback\documents\github\pandas" -C:\python34-32\python.exe setup.py build > build.34-32.log 2>&1 - -echo "installing" -C:\python34-32\python.exe setup.py bdist --formats=wininst > install.34-32.log 2>&1 - -echo "testing" -C:\python34-32\scripts\nosetests -A "not slow" build\lib.win32-3.4\pandas > test.34-32.log 2>&1 - -echo "versions" -cd build\lib.win32-3.4 -C:\python34-32\python.exe ../../ci/print_versions.py > ../../versions.34-32.log 2>&1 - -exit - - - diff --git a/scripts/windows_builder/build_34-64.bat b/scripts/windows_builder/build_34-64.bat deleted file mode 100644 index 3a8512b730346..0000000000000 --- a/scripts/windows_builder/build_34-64.bat +++ /dev/null @@ -1,27 +0,0 @@ 
-@echo off -echo "starting 34-64" - -setlocal EnableDelayedExpansion -set MSSdk=1 -CALL "C:\Program Files\Microsoft SDKs\Windows\v7.1\Bin\SetEnv.cmd" /x64 /release -set DISTUTILS_USE_SDK=1 - -title 34-64 build -echo "building" -cd "c:\users\Jeff Reback\documents\github\pandas" -C:\python34-64\python.exe setup.py build > build.34-64.log 2>&1 - -echo "installing" -C:\python34-64\python.exe setup.py bdist --formats=wininst > install.34-64.log 2>&1 - -echo "testing" -C:\python34-64\scripts\nosetests -A "not slow" build\lib.win-amd64-3.4\pandas > test.34-64.log 2>&1 - -echo "versions" -cd build\lib.win-amd64-3.4 -C:\python34-64\python.exe ../../ci/print_versions.py > ../../versions.34-64.log 2>&1 - -exit - - - diff --git a/scripts/windows_builder/check_and_build.bat b/scripts/windows_builder/check_and_build.bat deleted file mode 100644 index 32be1bde1f7f3..0000000000000 --- a/scripts/windows_builder/check_and_build.bat +++ /dev/null @@ -1,2 +0,0 @@ -set PYTHONPATH=c:/python27-64/lib -c:/python27-64/python.exe c:/Builds/check_and_build.py %1 %2 %3 %4 %4 %6 %7 %8 %9 diff --git a/scripts/windows_builder/check_and_build.py b/scripts/windows_builder/check_and_build.py deleted file mode 100644 index 2eb32fb4265d9..0000000000000 --- a/scripts/windows_builder/check_and_build.py +++ /dev/null @@ -1,194 +0,0 @@ -import datetime -import git -import logging -import os, re, time -import subprocess -import argparse -import pysftp - -# parse the args -parser = argparse.ArgumentParser(description='build, test, and install updated versions of master pandas') -parser.add_argument('-b', '--build', - help='run just this build', - dest='build') -parser.add_argument('-u', '--update', - help='get a git update', - dest='update', - action='store_true', - default=False) -parser.add_argument('-t', '--test', - help='run the tests', - dest='test', - action='store_true', - default=False) -parser.add_argument('-c', '--compare', - help='show the last tests compare', - dest='compare', - action='store_true', - default=False) -parser.add_argument('-v', '--version', - help='show the last versions', - dest='version', - action='store_true', - default=False) -parser.add_argument('-i', '--install', - help='run the install', - dest='install', - action='store_true', - default=False) -parser.add_argument('--dry', - help='dry run', - dest='dry', - action='store_true', - default=False) - -args = parser.parse_args() -dry_run = args.dry - -builds = ['27-32','27-64','34-32','34-64'] -base_dir = "C:\Users\Jeff Reback\Documents\GitHub\pandas" -remote_host='pandas.pydata.org' -username='pandas' -password=############ - -# drop python from our environment to avoid -# passing this onto sub-processes -env = os.environ -del env['PYTHONPATH'] - -# the stdout logger -fmt = '%(asctime)s: %(message)s' -logger = logging.getLogger('check_and_build') -logger.setLevel(logging.DEBUG) -stream_handler = logging.StreamHandler() -stream_handler.setFormatter(logging.Formatter(fmt)) -logger.addHandler(stream_handler) - -def run_all(test=False,compare=False,install=False,version=False,build=None): - # run everything - - for b in builds: - if build is not None and build != b: - continue - if test: - do_rebuild(b) - if compare or test: - try: - do_compare(b) - except (Exception) as e: - logger.info("ERROR COMPARE {0} : {1}".format(b,e)) - if version: - try: - do_version(b) - except (Exception) as e: - logger.info("ERROR VERSION {0} : {1}".format(b,e)) - - if install: - run_install() - -def do_rebuild(build): - # trigger the rebuild - - cmd = 
"c:/Builds/build_{0}.bat".format(build) - logger.info("rebuild : {0}".format(cmd)) - p = subprocess.Popen("start /wait /min {0}".format(cmd),env=env,shell=True,close_fds=True) - ret = p.wait() - -def do_compare(build): - # print the test outputs - - f = os.path.join(base_dir,"test.{0}.log".format(build)) - with open(f,'r') as fh: - for l in fh: - l = l.rstrip() - if l.startswith('ERROR:'): - logger.info("{0} : {1}".format(build,l)) - if l.startswith('Ran') or l.startswith('OK') or l.startswith('FAIL'): - logger.info("{0} : {1}".format(build,l)) - -def do_version(build): - # print the version strings - - f = os.path.join(base_dir,"versions.{0}.log".format(build)) - with open(f,'r') as fh: - for l in fh: - l = l.rstrip() - logger.info("{0} : {1}".format(build,l)) - -def do_update(is_verbose=True): - # update git; return True if the commit has changed - - repo = git.Repo(base_dir) - master = repo.heads.master - origin = repo.remotes.origin - start_commit = master.commit - - if is_verbose: - logger.info("current commit : {0}".format(start_commit)) - - try: - origin.update() - except (Exception) as e: - logger.info("update exception : {0}".format(e)) - try: - origin.pull() - except (Exception) as e: - logger.info("pull exception : {0}".format(e)) - - result = start_commit != master.commit - if result: - if is_verbose: - logger.info("commits changed : {0} -> {1}".format(start_commit,master.commit)) - return result - -def run_install(): - # send the installation binaries - - repo = git.Repo(base_dir) - master = repo.heads.master - commit = master.commit - short_hash = str(commit)[:7] - - logger.info("sending files : {0}".format(commit)) - d = os.path.join(base_dir,"dist") - files = [ f for f in os.listdir(d) if re.search(short_hash,f) ] - srv = pysftp.Connection(host=remote_host,username=username,password=password) - srv.chdir("www/pandas-build/dev") - - # get current files - remote_files = set(srv.listdir(path='.')) - - for f in files: - if f not in remote_files: - logger.info("sending: {0}".format(f)) - local = os.path.join(d,f) - srv.put(localpath=local) - - srv.close() - logger.info("sending files: done") - -# just perform the action -if args.update or args.test or args.compare or args.install or args.version: - if args.update: - do_update() - run_all(test=args.test,compare=args.compare,install=args.install,version=args.version,build=args.build) - exit(0) - -# file logging -file_handler = logging.FileHandler("C:\Builds\logs\check_and_build.log") -file_handler.setFormatter(logging.Formatter(fmt)) -logger.addHandler(file_handler) - -logger.info("start") - -# main loop -while(True): - - if do_update(): - run_all(test=True,install=False) - - time.sleep(60*60) - -logger.info("exit") -file_handler.close() - diff --git a/scripts/windows_builder/readme.txt b/scripts/windows_builder/readme.txt deleted file mode 100644 index 789e2a9ee0c63..0000000000000 --- a/scripts/windows_builder/readme.txt +++ /dev/null @@ -1,17 +0,0 @@ -This is a collection of windows batch scripts (and a python script) -to rebuild the binaries, test, and upload the binaries for public distribution -upon a commit on github. - -Obviously requires that these be setup on windows -Requires an install of Windows SDK 3.5 and 4.0 -Full python installs for each version with the deps - -Currently supporting - -27-32,27-64,34-32,34-64 - -Note that 34 use the 4.0 SDK, while the other suse 3.5 SDK - -I installed these scripts in C:\Builds - -Installed libaries in C:\Installs