diff --git a/doc/source/10min.rst b/doc/source/10min.rst
index 3a3b3d5e36977..6320be3920730 100644
--- a/doc/source/10min.rst
+++ b/doc/source/10min.rst
@@ -652,11 +652,11 @@ Since version 0.15, pandas can include categorical data in a ``DataFrame``. For
     # Alternative: df["grade"] = df["raw_grade"].astype("category")
     df["grade"]
 
-    # Rename the levels
-    df["grade"].cat.levels = ["very good", "good", "very bad"]
+    # Rename the categories inplace
+    df["grade"].cat.categories = ["very good", "good", "very bad"]
 
-    # Reorder the levels and simultaneously add the missing levels
-    df["grade"].cat.reorder_levels(["very bad", "bad", "medium", "good", "very good"])
+    # Reorder the categories and simultaneously add the missing categories
+    df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])
     df["grade"]
     df.sort("grade")
     df.groupby("grade").size()
diff --git a/doc/source/api.rst b/doc/source/api.rst
index f831b97d2033f..8598bae5758c9 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -555,26 +555,33 @@ Categorical
 
 .. currentmodule:: pandas.core.categorical
 
-If the Series is of dtype ``category``, ``Series.cat`` can be used to access the the underlying
-``Categorical``. This accessor is similar to the ``Series.dt`` or ``Series.str``and has the
+If the Series is of dtype ``category``, ``Series.cat`` can be used to change the categorical
+data. This accessor is similar to the ``Series.dt`` or ``Series.str`` and has the
 following usable methods and properties (all available as ``Series.cat.``).
 
 .. autosummary::
    :toctree: generated/
 
-   Categorical.levels
+   Categorical.categories
    Categorical.ordered
-   Categorical.reorder_levels
-   Categorical.remove_unused_levels
+   Categorical.rename_categories
+   Categorical.reorder_categories
+   Categorical.add_categories
+   Categorical.remove_categories
+   Categorical.remove_unused_categories
+   Categorical.set_categories
+   Categorical.codes
+
+To create a Series of dtype ``category``, use ``cat = s.astype("category")``.
 
-The following methods are considered API when using ``Categorical`` directly:
+The following two ``Categorical`` constructors are considered API but should only be used when
+adding ordering information or special categories is needed at creation time of the categorical data:
 
 .. autosummary::
    :toctree: generated/
 
    Categorical
    Categorical.from_codes
-   Categorical.codes
 
 ``np.asarray(categorical)`` works by implementing the array interface. Be aware, that this converts
 the Categorical back to a numpy array, so levels and order information is not preserved!
diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst
index 6ed1a7982a64b..a5b00bbc4722f 100644
--- a/doc/source/categorical.rst
+++ b/doc/source/categorical.rst
@@ -24,55 +24,51 @@ Categorical
 .. note::
     While there was in `pandas.Categorical` in earlier versions, the ability to use
-    `Categorical` data in `Series` and `DataFrame` is new.
+    categorical data in `Series` and `DataFrame` is new.
 
-This is a introduction to pandas :class:`pandas.Categorical` type, including a short comparison
-with R's `factor`.
+This is an introduction to the pandas categorical data type, including a short comparison
+with R's ``factor``.
 
 `Categoricals` are a pandas data type, which correspond to categorical variables in
 statistics: a variable, which can take on only a limited, and usually fixed,
-number of possible values (commonly called `levels`). Examples are gender, social class,
+number of possible values (`categories`; `levels` in R).
Examples are gender, social class, blood types, country affiliations, observation time or ratings via Likert scales. -In contrast to statistical categorical variables, a `Categorical` might have an order (e.g. +In contrast to statistical categorical variables, categorical data might have an order (e.g. 'strongly agree' vs 'agree' or 'first observation' vs. 'second observation'), but numerical operations (additions, divisions, ...) are not possible. -All values of the `Categorical` are either in `levels` or `np.nan`. Order is defined by -the order of the `levels`, not lexical order of the values. Internally, the data structure -consists of a levels array and an integer array of `codes` which point to the real value in the -levels array. +All values of categorical data are either in `categories` or `np.nan`. Order is defined by +the order of `categories`, not lexical order of the values. Internally, the data structure +consists of a `categories` array and an integer array of `codes` which point to the real value in +the `categories` array. -`Categoricals` are useful in the following cases: +The categorical data type is useful in the following cases: * A string variable consisting of only a few different values. Converting such a string variable to a categorical variable will save some memory. * The lexical order of a variable is not the same as the logical order ("one", "two", "three"). - By converting to a categorical and specifying an order on the levels, sorting and + By converting to a categorical and specifying an order on the categories, sorting and min/max will use the logical order instead of the lexical order. * As a signal to other python libraries that this column should be treated as a categorical - variable (e.g. to use suitable statistical methods or plot types) + variable (e.g. to use suitable statistical methods or plot types). -See also the :ref:`API docs on Categoricals`. +See also the :ref:`API docs on categoricals`. Object Creation --------------- -Categorical `Series` or columns in a `DataFrame` can be crated in several ways: +Categorical `Series` or columns in a `DataFrame` can be created in several ways: -By passing a `Categorical` object to a `Series` or assigning it to a `DataFrame`: +By specifying ``dtype="category"`` when constructing a `Series`: .. ipython:: python - raw_cat = pd.Categorical(["a","b","c","a"]) - s = pd.Series(raw_cat) + s = pd.Series(["a","b","c","a"], dtype="category") s - df = pd.DataFrame({"A":["a","b","c","a"]}) - df["B"] = raw_cat - df -By converting an existing `Series` or column to a ``category`` type: +By converting an existing `Series` or column to a ``category`` dtype: .. ipython:: python @@ -92,7 +88,21 @@ By using some special functions: See :ref:`documentation ` for :func:`~pandas.cut`. -`Categoricals` have a specific ``category`` :ref:`dtype `: +By passing a :class:`pandas.Categorical` object to a `Series` or assigning it to a `DataFrame`. +This is the only possibility to specify differently ordered categories (or no order at all) at +creation time and the only reason to use :class:`pandas.Categorical` directly: + +.. ipython:: python + + raw_cat = pd.Categorical(["a","b","c","a"], categories=["b","c","d"], + ordered=False) + s = pd.Series(raw_cat) + s + df = pd.DataFrame({"A":["a","b","c","a"]}) + df["B"] = raw_cat + df + +Categorical data has a specific ``category`` :ref:`dtype `: .. ipython:: python @@ -100,13 +110,13 @@ See :ref:`documentation ` for :func:`~pandas.cut`. .. 
note:: - In contrast to R's `factor` function, a `Categorical` is not converting input values to - string and levels will end up the same data type as the original values. + In contrast to R's `factor` function, categorical data is not converting input values to + strings and categories will end up the same data type as the original values. .. note:: In contrast to R's `factor` function, there is currently no way to assign/change labels at - creation time. Use `levels` to change the levels after creation time. + creation time. Use `categories` to change the categories after creation time. To get back to the original Series or `numpy` array, use ``Series.astype(original_dtype)`` or ``np.asarray(categorical)``: @@ -119,152 +129,145 @@ To get back to the original Series or `numpy` array, use ``Series.astype(origina s2 s3 = s2.astype('string') s3 - np.asarray(s2.cat) + np.asarray(s2) -If you have already `codes` and `levels`, you can use the :func:`~pandas.Categorical.from_codes` +If you have already `codes` and `categories`, you can use the :func:`~pandas.Categorical.from_codes` constructor to save the factorize step during normal constructor mode: .. ipython:: python splitter = np.random.choice([0,1], 5, p=[0.5,0.5]) - pd.Categorical.from_codes(splitter, levels=["train", "test"]) + s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"])) Description ----------- -Using ``.describe()`` on a ``Categorical(...)`` or a ``Series(Categorical(...))`` will show -different output. - - -As part of a `Dataframe` or as a `Series` a similar output as for a `Series` of type ``string`` is -shown. Calling ``Categorical.describe()`` will show the frequencies for each level, with NA for -unused levels. +Using ``.describe()`` on categorical data will produce similar output to a `Series` or +`DataFrame` of type ``string``. .. ipython:: python - cat = pd.Categorical(["a","c","c",np.nan], levels=["b","a","c",np.nan] ) + cat = pd.Categorical(["a","c","c",np.nan], categories=["b","a","c",np.nan] ) df = pd.DataFrame({"cat":cat, "s":["a","c","c",np.nan]}) df.describe() - cat.describe() + df["cat"].describe() -Working with levels -------------------- +Working with categories +----------------------- -`Categoricals` have a `levels` property, which list their possible values. If you don't -manually specify levels, they are inferred from the passed in values. `Series` of type -``category`` expose the same interface via their `cat` property. +Categorical data has a `categories` and a `ordered` property, which list their possible values and +whether the ordering matters or not. These properties are exposed as ``s.cat.categories`` and +``s.cat.ordered``. If you don't manually specify categories and ordering, they are inferred from the +passed in values. .. ipython:: python - raw_cat = pd.Categorical(["a","b","c","a"]) - raw_cat.levels - raw_cat.ordered - # Series of type "category" also expose these interface via the .cat property: - s = pd.Series(raw_cat) - s.cat.levels + s = pd.Series(["a","b","c","a"], dtype="category") + s.cat.categories s.cat.ordered -.. note:: - New `Categorical` are automatically ordered if the passed in values are sortable or a - `levels` argument is supplied. This is a difference to R's `factors`, which are unordered - unless explicitly told to be ordered (``ordered=TRUE``). - -It's also possible to pass in the levels in a specific order: +It's also possible to pass in the categories in a specific order: .. 
ipython:: python

-    raw_cat = pd.Categorical(["a","b","c","a"], levels=["c","b","a"])
-    s = pd.Series(raw_cat)
-    s.cat.levels
+    s = pd.Series(pd.Categorical(["a","b","c","a"], categories=["c","b","a"]))
+    s.cat.categories
     s.cat.ordered
 
 .. note::
-
-    Passing in a `levels` argument implies ``ordered=True``. You can of course overwrite that by
+    New categorical data is automatically ordered if the passed in values are sortable or a
+    `categories` argument is supplied. This is a difference to R's `factors`, which are unordered
+    unless explicitly told to be ordered (``ordered=TRUE``). You can of course overwrite that by
     passing in an explicit ``ordered=False``.
 
-Any value omitted in the levels argument will be replaced by `np.nan`:
-
-.. ipython:: python
-
-    raw_cat = pd.Categorical(["a","b","c","a"], levels=["a","b"])
-    s = pd.Series(raw_cat)
-    s.cat.levels
-    s
+Renaming categories
+*******************
 
-Renaming levels is done by assigning new values to the ``Category.levels`` or
-``Series.cat.levels`` property:
+Renaming categories is done by assigning new values to the ``Series.cat.categories`` property or
+by using the :func:`Categorical.rename_categories` method:
 
 .. ipython:: python
 
-    s = pd.Series(pd.Categorical(["a","b","c","a"]))
-    s
-    s.cat.levels = ["Group %s" % g for g in s.cat.levels]
+    s = pd.Series(["a","b","c","a"], dtype="category")
     s
-    s.cat.levels = [1,2,3]
+    s.cat.categories = ["Group %s" % g for g in s.cat.categories]
     s
+    s.cat.rename_categories([1,2,3])
 
 .. note::
 
-    I contrast to R's `factor`, a `Categorical` can have levels of other types than string.
+    In contrast to R's `factor`, categorical data can have categories of other types than string.
 
-Levels must be unique or a `ValueError` is raised:
+.. note::
+
+    Be aware that assigning new categories is an inplace operation, while most other operations
+    under ``Series.cat`` by default return a new Series of dtype `category`.
+
+Categories must be unique or a `ValueError` is raised:
 
 .. ipython:: python
 
     try:
-        s.cat.levels = [1,1,1]
+        s.cat.categories = [1,1,1]
     except ValueError as e:
         print("ValueError: " + str(e))
 
-Appending levels can be done by assigning a levels list longer than the current levels:
+Appending new categories
+************************
+
+Appending categories can be done by using the :func:`Categorical.add_categories` method:
 
 .. ipython:: python
 
-    s.cat.levels = [1,2,3,4]
-    s.cat.levels
+    s = s.cat.add_categories([4])
+    s.cat.categories
     s
 
-.. note::
-    Adding levels in other positions can be done with ``.reorder_levels()``.
+Removing categories
+*******************
 
-Removing a level is also possible, but only the last level(s) can be removed by assigning a
-shorter list than current levels. Values which are omitted are replaced by ``np.nan``.
+Removing categories can be done by using the :func:`Categorical.remove_categories` method. Values
+which are removed are replaced by ``np.nan``:
 
 .. ipython:: python
 
-    s.cat.levels = [1,2]
+    s = s.cat.remove_categories([4])
     s
 
-.. note::
+Removing unused categories
+**************************
 
-    It's only possible to remove or add a level at the last position. If that's not where you want
-    to remove an old or add a new level, use ``Category.reorder_levels(new_order)`` or
-    ``Series.cat.reorder_levels(new_order)`` methods before or after.
-
-Removing unused levels can also be done:
+Removing unused categories can also be done:
 
 .. ipython:: python
 
-    raw = pd.Categorical(["a","b","a"], levels=["a","b","c","d"])
-    c = pd.Series(raw)
-    raw
-    raw.remove_unused_levels()
-    raw
-    c.cat.remove_unused_levels()
-    c
+    s = pd.Series(pd.Categorical(["a","b","a"], categories=["a","b","c","d"]))
+    s
+    s.cat.remove_unused_categories()
 
-.. note::
+Setting categories
+******************
+
+If you want to remove and add new categories in one step (which has some speed advantage),
+or simply set the categories to a predefined scale, use :func:`Categorical.set_categories`.
+
+.. ipython:: python
 
-    In contrast to R's `factor` function, passing a `Categorical` as the sole input to the
-    `Categorical` constructor will *not* remove unused levels but create a new `Categorical`
-    which is equal to the passed in one!
+    s = pd.Series(["one","two","four", "-"], dtype="category")
+    s
+    s = s.cat.set_categories(["one","two","three","four"])
+    s
+.. note::
+    Be aware that :func:`Categorical.set_categories` cannot know whether some category is omitted
+    intentionally or because it is misspelled or (under Python3) due to a type difference (e.g.,
+    numpy's S1 dtype and python strings). This can result in surprising behaviour!
 
 Ordered or not...
 -----------------
 
-If a `Categoricals` is ordered (``cat.ordered == True``), then the order of the levels has a
+If categorical data is ordered (``s.cat.ordered == True``), then the order of the categories has a
 meaning and certain operations are possible. If the categorical is unordered, a `TypeError` is
 raised.
@@ -275,88 +278,81 @@ raised.
         s.sort()
     except TypeError as e:
         print("TypeError: " + str(e))
-    s = pd.Series(pd.Categorical(["a","b","c","a"], ordered=True))
+    s = pd.Series(["a","b","c","a"], dtype="category") # ordered per default!
     s.sort()
     s
-    print(s.min(), s.max())
+    s.min(), s.max()
 
-.. note::
-    ``ordered=True`` is not necessary needed in the second case, as lists of strings are sortable
-    and so the resulting `Categorical` is ordered.
-
-Sorting will use the order defined by levels, not any lexical order present on the data type.
+Sorting will use the order defined by categories, not any lexical order present on the data type.
 This is even true for strings and numeric data:
 
 .. ipython:: python
 
-    s = pd.Series(pd.Categorical([1,2,3,1]))
-    s.cat.levels = [2,3,1]
+    s = pd.Series([1,2,3,1], dtype="category")
+    s.cat.categories = [2,3,1]
     s
     s.sort()
     s
     print(s.min(), s.max())
 
-Reordering the levels is possible via the ``Categorical.reorder_levels(new_levels)`` or
-``Series.cat.reorder_levels(new_levels)`` methods. All old levels must be included in the new
-levels.
+Reordering the categories is possible via the :func:`Categorical.reorder_categories` and
+the :func:`Categorical.set_categories` methods. For :func:`Categorical.reorder_categories`, all
+old categories must be included in the new categories and no new categories are allowed.
 
 .. ipython:: python
 
-    s2 = pd.Series(pd.Categorical([1,2,3,1]))
-    s2.cat.reorder_levels([2,3,1])
-    s2
-    s2.sort()
-    s2
-    print(s2.min(), s2.max())
-
+    s = pd.Series([1,2,3,1], dtype="category")
+    s = s.cat.reorder_categories([2,3,1])
+    s
+    s.sort()
+    s
+    print(s.min(), s.max())
 ..
note:: - Note the difference between assigning new level names and reordering the levels: the first - renames levels and therefore the individual values in the `Series`, but if the first + + Note the difference between assigning new categories and reordering the categories: the first + renames categories and therefore the individual values in the `Series`, but if the first position was sorted last, the renamed value will still be sorted last. Reordering means that the way values are sorted is different afterwards, but not that individual values in the `Series` are changed. -You can also add new levels with :func:`Categorical.reorder_levels`, as long as you include all -old levels: - -.. ipython:: python - - s3 = pd.Series(pd.Categorical(["a","b","d"])) - s3.cat.reorder_levels(["a","b","c","d"]) - s3 +.. note:: + If the `Categorical` is not ordered, ``Series.min()`` and ``Series.max()`` will raise + `TypeError`. Numeric operations like ``+``, ``-``, ``*``, ``/`` and operations based on them + (e.g.``Series.median()``, which would need to compute the mean between two values if the length + of an array is even) do not work and raise a `TypeError`. -Operations ----------- -The following operations are possible with categorical data: +Comparisons +----------- Comparing `Categoricals` with other objects is possible in two cases: - * comparing a `Categorical` to another `Categorical`, when `level` and `ordered` is the same or - * comparing a `Categorical` to a scalar. + * comparing a categorical Series to another categorical Series, when `categories` and `ordered` is + the same or + * comparing a categorical Series to a scalar. All other comparisons will raise a TypeError. .. ipython:: python - cat = pd.Series(pd.Categorical([1,2,3], levels=[3,2,1])) - cat_base = pd.Series(pd.Categorical([2,2,2], levels=[3,2,1])) + cat = pd.Series(pd.Categorical([1,2,3], categories=[3,2,1])) + cat_base = pd.Series(pd.Categorical([2,2,2], categories=[3,2,1])) cat_base2 = pd.Series(pd.Categorical([2,2,2])) cat cat_base cat_base2 -Comparing to a categorical with the same levels and ordering or to a scalar works: +Comparing to a categorical with the same categories and ordering or to a scalar works: .. ipython:: python cat > cat_base cat > 2 -This doesn't work because the levels are not the same: +This doesn't work because the categories are not the same: .. ipython:: python @@ -367,10 +363,11 @@ This doesn't work because the levels are not the same: .. note:: - Comparisons with `Series`, `np.array` or a `Categorical` with different levels or ordering - will raise an `TypeError` because custom level ordering would result in two valid results: - one with taking in account the ordering and one without. If you want to compare a `Categorical` - with such a type, you need to be explicit and convert the `Categorical` to values: + Comparisons with `Series`, `np.array` or a `Categorical` with different categories or ordering + will raise an `TypeError` because custom categories ordering could be interpreted in two ways: + one with taking in account the ordering and one without. If you want to compare a categorical + series with such a type, you need to be explicit and convert the categorical data back to the + original values: .. ipython:: python @@ -383,50 +380,29 @@ This doesn't work because the levels are not the same: np.asarray(cat) > base -Getting the minimum and maximum, if the categorical is ordered: - -.. 
ipython:: python - - s = pd.Series(pd.Categorical(["a","b","c","a"], levels=["c","a","b","d"])) - print(s.min(), s.max()) - -.. note:: - - If the `Categorical` is not ordered, ``Categorical.min()`` and ``Categorical.max()`` and the - corresponding operations on `Series` will raise `TypeError`. - -The mode: - -.. ipython:: python - - raw_cat = pd.Categorical(["a","b","c","c"], levels=["c","a","b","d"]) - s = pd.Series(raw_cat) - raw_cat.mode() - s.mode() - -.. note:: +Operations +---------- - Numeric operations like ``+``, ``-``, ``*``, ``/`` and operations based on them (e.g. - ``.median()``, which would need to compute the mean between two values if the length of an - array is even) do not work and raise a `TypeError`. +Apart from ``Series.min()``, ``Series.max()`` and ``Series.mode()``, the following operations are +possible with categorical data: -`Series` methods like `Series.value_counts()` will use all levels, even if some levels are not +`Series` methods like `Series.value_counts()` will use all categories, even if some categories are not present in the data: .. ipython:: python - s = pd.Series(pd.Categorical(["a","b","c","c"], levels=["c","a","b","d"])) + s = pd.Series(pd.Categorical(["a","b","c","c"], categories=["c","a","b","d"])) s.value_counts() -Groupby will also show "unused" levels: +Groupby will also show "unused" categories: .. ipython:: python - cats = pd.Categorical(["a","b","b","b","c","c","c"], levels=["a","b","c","d"]) + cats = pd.Categorical(["a","b","b","b","c","c","c"], categories=["a","b","c","d"]) df = pd.DataFrame({"cats":cats,"values":[1,2,2,2,3,4,5]}) df.groupby("cats").mean() - cats2 = pd.Categorical(["a","a","b","b"], levels=["a","b","c"]) + cats2 = pd.Categorical(["a","a","b","b"], categories=["a","b","c"]) df2 = pd.DataFrame({"cats":cats2,"B":["c","d","c","d"], "values":[1,2,3,4]}) df2.groupby(["cats","B"]).mean() @@ -435,7 +411,7 @@ Pivot tables: .. ipython:: python - raw_cat = pd.Categorical(["a","a","b","b"], levels=["a","b","c"]) + raw_cat = pd.Categorical(["a","a","b","b"], categories=["a","b","c"]) df = pd.DataFrame({"A":raw_cat,"B":["c","d","c","d"], "values":[1,2,3,4]}) pd.pivot_table(df, values='values', index=['A', 'B']) @@ -444,7 +420,7 @@ Data munging The optimized pandas data access methods ``.loc``, ``.iloc``, ``.ix`` ``.at``, and ``.iat``, work as normal, the only difference is the return type (for getting) and -that only values already in the levels can be assigned. +that only values already in `categories` can be assigned. Getting ~~~~~~~ @@ -454,8 +430,8 @@ the ``category`` dtype is preserved. .. ipython:: python - cats = pd.Categorical(["a","b","b","b","c","c","c"], levels=["a","b","c"]) idx = pd.Index(["h","i","j","k","l","m","n",]) + cats = pd.Series(["a","b","b","b","c","c","c"], dtype="category", index=idx) values= [1,2,2,2,3,4,5] df = pd.DataFrame({"cats":cats,"values":values}, index=idx) df.iloc[2:4,:] @@ -464,7 +440,7 @@ the ``category`` dtype is preserved. df.ix["h":"j",0:1] df[df["cats"] == "b"] -An example where the `Categorical` is not preserved is if you take one single row: the +An example where the category type is not preserved is if you take one single row: the resulting `Series` is of dtype ``object``: .. 
ipython:: python @@ -472,20 +448,20 @@ resulting `Series` is of dtype ``object``: # get the complete "h" row as a Series df.loc["h", :] -Returning a single item from a `Categorical` will also return the value, not a `Categorical` +Returning a single item from categorical data will also return the value, not a categorical of length "1". .. ipython:: python df.iat[0,0] - df["cats"].cat.levels = ["x","y","z"] + df["cats"].cat.categories = ["x","y","z"] df.at["h","cats"] # returns a string .. note:: This is a difference to R's `factor` function, where ``factor(c(1,2,3))[1]`` returns a single value `factor`. -To get a single value `Series` of type ``category`` pass in a single value list: +To get a single value `Series` of type ``category`` pass in a list with a single value: .. ipython:: python @@ -495,12 +471,12 @@ Setting ~~~~~~~ Setting values in a categorical column (or `Series`) works as long as the value is included in the -`levels`: +`categories`: .. ipython:: python - cats = pd.Categorical(["a","a","a","a","a","a","a"], levels=["a","b"]) idx = pd.Index(["h","i","j","k","l","m","n"]) + cats = pd.Categorical(["a","a","a","a","a","a","a"], categories=["a","b"]) values = [1,1,1,1,1,1,1] df = pd.DataFrame({"cats":cats,"values":values}, index=idx) @@ -511,14 +487,14 @@ Setting values in a categorical column (or `Series`) works as long as the value except ValueError as e: print("ValueError: " + str(e)) -Setting values by assigning a `Categorical` will also check that the `levels` match: +Setting values by assigning categorical data will also check that the `categories` match: .. ipython:: python - df.loc["j":"k","cats"] = pd.Categorical(["a","a"], levels=["a","b"]) + df.loc["j":"k","cats"] = pd.Categorical(["a","a"], categories=["a","b"]) df try: - df.loc["j":"k","cats"] = pd.Categorical(["b","b"], levels=["a","b","c"]) + df.loc["j":"k","cats"] = pd.Categorical(["b","b"], categories=["a","b","c"]) except ValueError as e: print("ValueError: " + str(e)) @@ -527,8 +503,8 @@ Assigning a `Categorical` to parts of a column of other types will use the value .. ipython:: python df = pd.DataFrame({"a":[1,1,1,1,1], "b":["a","a","a","a","a"]}) - df.loc[1:2,"a"] = pd.Categorical(["b","b"], levels=["a","b"]) - df.loc[2:3,"b"] = pd.Categorical(["b","b"], levels=["a","b"]) + df.loc[1:2,"a"] = pd.Categorical(["b","b"], categories=["a","b"]) + df.loc[2:3,"b"] = pd.Categorical(["b","b"], categories=["a","b"]) df df.dtypes @@ -537,29 +513,29 @@ Merging ~~~~~~~ You can concat two `DataFrames` containing categorical data together, -but the levels of these `Categoricals` need to be the same: +but the categories of these categoricals need to be the same: .. ipython:: python - cat = pd.Categorical(["a","b"], levels=["a","b"]) + cat = pd.Series(["a","b"], dtype="category") vals = [1,2] df = pd.DataFrame({"cats":cat, "vals":vals}) res = pd.concat([df,df]) res res.dtypes -In this case the levels are not the same and so an error is raised: +In this case the categories are not the same and so an error is raised: .. ipython:: python df_different = df.copy() - df_different["cats"].cat.levels = ["a","b","c"] + df_different["cats"].cat.categories = ["c","d"] try: pd.concat([df,df_different]) except ValueError as e: print("ValueError: " + str(e)) -The same applies to ``df.append(df)``. +The same applies to ``df.append(df_different)``. Getting Data In/Out ------------------- @@ -568,8 +544,8 @@ Writing data (`Series`, `Frames`) to a HDF store that contains a ``category`` dt raise ``NotImplementedError``. 
Writing to a CSV file will convert the data, effectively removing any information about the -`Categorical` (levels and ordering). So if you read back the CSV file you have to convert the -relevant columns back to `category` and assign the right levels and level ordering. +categorical (categories and ordering). So if you read back the CSV file you have to convert the +relevant columns back to `category` and assign the right categories and categories ordering. .. ipython:: python :suppress: @@ -579,10 +555,10 @@ relevant columns back to `category` and assign the right levels and level orderi .. ipython:: python s = pd.Series(pd.Categorical(['a', 'b', 'b', 'a', 'a', 'd'])) - # rename the levels - s.cat.levels = ["very good", "good", "bad"] - # reorder the levels and add missing levels - s.cat.reorder_levels(["very bad", "bad", "medium", "good", "very good"]) + # rename the categories + s.cat.categories = ["very good", "good", "bad"] + # reorder the categories and add missing categories + s = s.cat.set_categories(["very bad", "bad", "medium", "good", "very good"]) df = pd.DataFrame({"cats":s, "vals":[1,2,3,4,5,6]}) csv = StringIO() df.to_csv(csv) @@ -591,7 +567,8 @@ relevant columns back to `category` and assign the right levels and level orderi df2["cats"] # Redo the category df2["cats"] = df2["cats"].astype("category") - df2["cats"].cat.reorder_levels(["very bad", "bad", "medium", "good", "very good"]) + df2["cats"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"], + inplace=True) df2.dtypes df2["cats"] @@ -603,46 +580,88 @@ pandas primarily uses the value `np.nan` to represent missing data. It is by default not included in computations. See the :ref:`Missing Data section ` -There are two ways a `np.nan` can be represented in `Categorical`: either the value is not -available ("missing value") or `np.nan` is a valid level. +There are two ways a `np.nan` can be represented in categorical data: either the value is not +available ("missing value") or `np.nan` is a valid category. .. ipython:: python - s = pd.Series(pd.Categorical(["a","b",np.nan,"a"])) + s = pd.Series(["a","b",np.nan,"a"], dtype="category") + # only two categories s - # only two levels - s.cat.levels - s2 = pd.Series(pd.Categorical(["a","b","c","a"])) - s2.cat.levels = [1,2,np.nan] + s2 = pd.Series(["a","b","c","a"], dtype="category") + s2.cat.categories = [1,2,np.nan] + # three categories, np.nan included s2 - # three levels, np.nan included - # Note: as int arrays can't hold NaN the levels were converted to object - s2.cat.levels + +.. note:: + As integer `Series` can't include NaN, the categories were converted to `object`. .. note:: Missing value methods like ``isnull`` and ``fillna`` will take both missing values as well as - `np.nan` levels into account: + `np.nan` categories into account: .. ipython:: python - c = pd.Categorical(["a","b",np.nan]) - c.levels = ["a","b",np.nan] - # will be inserted as a NA level: + c = pd.Series(["a","b",np.nan], dtype="category") + c.cat.set_categories(["a","b",np.nan], inplace=True) + # will be inserted as a NA category: c[0] = np.nan s = pd.Series(c) s pd.isnull(s) s.fillna("a") +Differences to R's `factor` +--------------------------- + +The following differences to R's factor functions can be observed: + +* R's `levels` are named `categories` +* R's `levels` are always of type string, while `categories` in pandas can be of any dtype. +* New categorical data is automatically ordered if the passed in values are sortable or a + `categories` argument is supplied. 
This is a difference to R's `factors`, which are unordered
+  unless explicitly told to be ordered (``ordered=TRUE``).
+* It's not possible to specify labels at creation time. Use ``s.cat.rename_categories(new_labels)``
+  afterwards.
+* In contrast to R's `factor` function, using categorical data as the sole input to create a
+  new categorical series will *not* remove unused categories but create a new categorical series
+  which is equal to the passed in one!
+
 Gotchas
 -------
 
+Old style constructor usage
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In versions of pandas earlier than 0.15, a `Categorical` could be constructed by passing in precomputed
+`codes` (called then `labels`) instead of values with categories. The `codes` were interpreted as
+pointers to the categories with `-1` as `NaN`. This type of constructor usage is replaced by
+the special constructor :func:`Categorical.from_codes`.
+
+Unfortunately, in some special cases, code which assumes the old style constructor usage
+will still work with the current pandas version but return different results, resulting in subtle bugs:
+
+.. code-block:: python
+
+    >>> cat = pd.Categorical([1,2], [1,2,3])
+    >>> # old version
+    >>> cat.get_values()
+    array([2, 3], dtype=int64)
+    >>> # new version
+    >>> cat.get_values()
+    array([1, 2], dtype=int64)
+
+.. warning::
+    If you used `Categoricals` with older versions of pandas, please audit your code before
+    upgrading and change your code to use the :func:`~pandas.Categorical.from_codes`
+    constructor.
+
 `Categorical` is not a `numpy` array
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Currently, `Categorical` and the corresponding ``category`` `Series` is implemented as a python
-object and not as a low level `numpy` array dtype. This leads to some problems.
+Currently, categorical data and the underlying `Categorical` are implemented as a python
+object and not as a low-level `numpy` array dtype. This leads to some problems.
 
 `numpy` itself doesn't know about the new `dtype`:
 
@@ -667,7 +686,7 @@ Dtype comparisons work:
     np.str_ == dtype
 
 Using `numpy` functions on a `Series` of type ``category`` should not work as `Categoricals`
-are not numeric data (even in the case that ``.levels`` is numeric).
+are not numeric data (even in the case that ``.categories`` is numeric).
 
 .. ipython:: python
 
@@ -681,6 +700,42 @@ are not numeric data (even in the case that ``.levels`` is numeric).
 .. note::
     If such a function works, please file a bug at https://github.com/pydata/pandas!
 
+dtype in apply
+~~~~~~~~~~~~~~
+
+Pandas currently does not preserve the dtype in apply functions: If you apply along rows you get
+a `Series` of ``object`` `dtype` (same as getting a row -> getting one element will return a
+basic type) and applying along columns will also convert to object.
+
+.. ipython:: python
+
+    df = pd.DataFrame({"a":[1,2,3,4],
+                       "b":["a","b","c","d"],
+                       "cats":pd.Categorical([1,2,3,2])})
+    df.apply(lambda row: type(row["cats"]), axis=1)
+    df.apply(lambda col: col.dtype, axis=0)
+
+No categorical index
+~~~~~~~~~~~~~~~~~~~~
+
+There is currently no index of type ``category``, so setting the index to a categorical column will
+convert the categorical data to a "normal" dtype first and therefore remove any custom
+ordering of the categories:
+
+.. ipython:: python
+
+    cats = pd.Categorical([1,2,3,4], categories=[4,2,3,1])
+    strings = ["a","b","c","d"]
+    values = [4,2,3,1]
+    df = pd.DataFrame({"strings":strings, "values":values}, index=cats)
+    df.index
+    # This should sort by categories but does not as there is no CategoricalIndex!
+ df.sort_index() + +.. note:: + This could change if a `CategoricalIndex` is implemented (see + https://github.com/pydata/pandas/issues/7629) + Side effects ~~~~~~~~~~~~ @@ -690,115 +745,31 @@ means that changes to the `Series` will in most cases change the original `Categ .. ipython:: python - cat = pd.Categorical([1,2,3,10], levels=[1,2,3,4,10]) + cat = pd.Categorical([1,2,3,10], categories=[1,2,3,4,10]) s = pd.Series(cat, name="cat") cat s.iloc[0:2] = 10 cat df = pd.DataFrame(s) - df["cat"].cat.levels = [1,2,3,4,5] + df["cat"].cat.categories = [1,2,3,4,5] cat -Use ``copy=True`` to prevent such a behaviour: +Use ``copy=True`` to prevent such a behaviour or simply don't reuse `Categoricals`: .. ipython:: python - cat = pd.Categorical([1,2,3,10], levels=[1,2,3,4,10]) + cat = pd.Categorical([1,2,3,10], categories=[1,2,3,4,10]) s = pd.Series(cat, name="cat", copy=True) cat s.iloc[0:2] = 10 cat .. note:: - This also happens in some cases when you supply a `numpy` array instea dof a `Categorical`: - using an int array (e.g. ``np.array([1,2,3,4])``) will exhibit the same behaviour, but using + This also happens in some cases when you supply a `numpy` array instead of a `Categorical`: + using an int array (e.g. ``np.array([1,2,3,4])``) will exhibit the same behaviour, while using a string array (e.g. ``np.array(["a","b","c","a"])``) will not. -Danger of confusion -~~~~~~~~~~~~~~~~~~~ - -Both `Series` and `Categorical` have a method ``.reorder_levels()`` but for different things. For -Series of type ``category`` this means that there is some danger to confuse both methods. - -.. ipython:: python - - s = pd.Series(pd.Categorical([1,2,3,4])) - print(s.cat.levels) - # wrong and raises an error: - try: - s.reorder_levels([4,3,2,1]) - except Exception as e: - print("Exception: " + str(e)) - # right - s.cat.reorder_levels([4,3,2,1]) - print(s.cat.levels) - -See also the API documentation for :func:`pandas.Series.reorder_levels` and -:func:`pandas.Categorical.reorder_levels` - -Old style constructor usage -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -I earlier versions, a `Categorical` could be constructed by passing in precomputed `codes` -(called then `labels`) instead of values with levels. The `codes` are interpreted as pointers -to the levels with `-1` as `NaN`. This usage is now deprecated and not available unless -``compat=True`` is passed to the constructor of `Categorical`. - -.. ipython:: python - :okwarning: - - # This raises a FutureWarning: - cat = pd.Categorical([1,2], levels=[1,2,3], compat=True) - cat.get_values() - -In the default case (``compat=False``) the first argument is interpreted as values. - -.. ipython:: python - - cat = pd.Categorical([1,2], levels=[1,2,3], compat=False) - cat.get_values() - -.. warning:: - Using Categorical with precomputed codes and levels is deprecated and a `FutureWarning` - is raised. Please change your code to use the :func:`~pandas.Categorical.from_codes` - constructor instead of adding ``compat=False``. - -No categorical index -~~~~~~~~~~~~~~~~~~~~ - -There is currently no index of type ``category``, so setting the index to a `Categorical` will -convert the `Categorical` to a normal `numpy` array first and therefore remove any custom -ordering of the levels: - -.. ipython:: python - - cats = pd.Categorical([1,2,3,4], levels=[4,2,3,1]) - strings = ["a","b","c","d"] - values = [4,2,3,1] - df = pd.DataFrame({"strings":strings, "values":values}, index=cats) - df.index - # This should sort by levels but does not as there is no CategoricalIndex! 
- df.sort_index() - -.. note:: - This could change if a `CategoricalIndex` is implemented (see - https://github.com/pydata/pandas/issues/7629) - -dtype in apply -~~~~~~~~~~~~~~ - -Pandas currently does not preserve the dtype in apply functions: If you apply along rows you get -a `Series` of ``object`` `dtype` (same as getting a row -> getting one element will return a -basic type) and applying along columns will also convert to object. - -.. ipython:: python - - df = pd.DataFrame({"a":[1,2,3,4], "b":["a","b","c","d"], "cats":pd.Categorical([1,2,3,2])}) - df.apply(lambda row: type(row["cats"]), axis=1) - df.apply(lambda col: col.dtype, axis=0) - - Future compatibility ~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index 1e52d7e20046e..d71775772eba4 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -40,6 +40,13 @@ users upgrade to this version. but instead subclass ``PandasObject``, similarly to the rest of the pandas objects. This change allows very easy sub-classing and creation of new index types. This should be a transparent change with only very limited API implications (See the :ref:`Internal Refactoring `) +.. warning:: + + The refactorings in :class:`~pandas.Categorical` changed the two argument constructor from + "codes/labels and levels" to "values and levels". This can lead to subtle bugs. If you use + :class:`~pandas.Categorical` directly, please audit your code before updating to this pandas + version and change it to use the :meth:`~pandas.Categorical.from_codes` constructor. + .. _whatsnew_0150.api: API changes @@ -563,11 +570,11 @@ For full docs, see the :ref:`Categorical introduction ` and the # Alternative: df["grade"] = df["raw_grade"].astype("category") df["grade"] - # Rename the levels - df["grade"].cat.levels = ["very good", "good", "very bad"] + # Rename the categories + df["grade"].cat.categories = ["very good", "good", "very bad"] - # Reorder the levels and simultaneously add the missing levels - df["grade"].cat.reorder_levels(["very bad", "bad", "medium", "good", "very good"]) + # Reorder the categories and simultaneously add the missing categories + df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"]) df["grade"] df.sort("grade") df.groupby("grade").size() @@ -575,9 +582,10 @@ For full docs, see the :ref:`Categorical introduction ` and the - ``pandas.core.group_agg`` and ``pandas.core.factor_agg`` were removed. As an alternative, construct a dataframe and use ``df.groupby().agg()``. -- Supplying "codes/labels and levels" to the :class:`~pandas.Categorical` constructor is deprecated and does - not work without supplying ``compat=True``. The default mode now uses "values and levels". - Please change your code to use the :meth:`~pandas.Categorical.from_codes` constructor. +- Supplying "codes/labels and levels" to the :class:`~pandas.Categorical` constructor is not + supported anymore. Supplying two arguments to the constructor is now interpreted as + "values and levels". Please change your code to use the :meth:`~pandas.Categorical.from_codes` + constructor. - The ``Categorical.labels`` attribute was renamed to ``Categorical.codes`` and is read only. 
If you want to manipulate codes, please use one of the diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 8d1b1588552bf..8c4f45fdeb57a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -213,7 +213,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False, raise TypeError("bins argument only works with numeric data.") values = cat.codes elif is_category: - bins = values.levels + bins = values.categories cat = values values = cat.codes @@ -248,11 +248,11 @@ def value_counts(values, sort=True, ascending=False, normalize=False, result = Series(counts, index=com._values_from_object(keys)) if bins is not None: # TODO: This next line should be more efficient - result = result.reindex(np.arange(len(cat.levels)), fill_value=0) + result = result.reindex(np.arange(len(cat.categories)), fill_value=0) if not is_category: result.index = bins[:-1] else: - result.index = cat.levels + result.index = cat.categories if sort: result.sort() diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 1e9f7d69c9341..9ee0018500b00 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -22,16 +22,17 @@ def _cat_compare_op(op): def f(self, other): # On python2, you can usually compare any type to any type, and Categoricals can be - # seen as a custom type, but having different results depending whether a level are + # seen as a custom type, but having different results depending whether categories are # the same or not is kind of insane, so be a bit stricter here and use the python3 idea # of comparing only things of equal type. if not self.ordered: if op in ['__lt__', '__gt__','__le__','__ge__']: raise TypeError("Unordered Categoricals can only compare equality or not") if isinstance(other, Categorical): - # Two Categoricals can only be be compared if the levels are the same - if (len(self.levels) != len(other.levels)) or not ((self.levels == other.levels).all()): - raise TypeError("Categoricals can only be compared if 'levels' are the same") + # Two Categoricals can only be be compared if the categories are the same + if (len(self.categories) != len(other.categories)) or \ + not ((self.categories == other.categories).all()): + raise TypeError("Categoricals can only be compared if 'categories' are the same") if not (self.ordered == other.ordered): raise TypeError("Categoricals can only be compared if 'ordered' is the same") na_mask = (self._codes == -1) | (other._codes == -1) @@ -42,8 +43,8 @@ def f(self, other): ret[na_mask] = False return ret elif np.isscalar(other): - if other in self.levels: - i = self.levels.get_loc(other) + if other in self.categories: + i = self.categories.get_loc(other) return getattr(self._codes, op)(i) else: return np.repeat(False, len(self)) @@ -67,37 +68,39 @@ def _maybe_to_categorical(array): return array -_codes_doc = """The level codes of this categorical. +_codes_doc = """The category codes of this categorical. Level codes are an array if integer which are the positions of the real -values in the levels array. +values in the categories array. -There is not setter, used the other categorical methods and the item setter on -Categorical to change values in the categorical. +There is not setter, use the other categorical methods and the normal item setter to change +values in the categorical. """ -_levels_doc = """The levels of this categorical. +_categories_doc = """The categories of this categorical. 
-Setting assigns new values to each level (effectively a rename of -each individual level). +Setting assigns new values to each category (effectively a rename of +each individual category). -The assigned value has to be a list-like object. If the number of -level-items is less than number of level-items in the current level, -all level-items at a higher position are set to NaN. If the number of -level-items is more that the current number of level-items, new -(unused) levels are added at the end. +The assigned value has to be a list-like object. All items must be unique and the number of items +in the new categories must be the same as the number of items in the old categories. -To add level-items in between, use `reorder_levels`. +Assigning to `categories` is a inplace operation! Raises ------ ValueError - If the new levels do not validate as levels + If the new categories do not validate as categories or if the number of new categories is + unequal the number of old categories See also -------- -Categorical.reorder_levels -Categorical.remove_unused_levels +rename_categories +reorder_categories +add_categories +remove_categories +remove_unused_categories +set_categories """ class Categorical(PandasObject): @@ -105,21 +108,21 @@ class Categorical(PandasObject): Represents a categorical variable in classic R / S-plus fashion `Categoricals` can only take on only a limited, and usually fixed, number - of possible values (`levels`). In contrast to statistical categorical + of possible values (`categories`). In contrast to statistical categorical variables, a `Categorical` might have an order, but numerical operations (additions, divisions, ...) are not possible. - All values of the `Categorical` are either in `levels` or `np.nan`. - Assigning values outside of `levels` will raise a `ValueError`. Order is - defined by the order of the `levels`, not lexical order of the values. + All values of the `Categorical` are either in `categories` or `np.nan`. + Assigning values outside of `categories` will raise a `ValueError`. Order is + defined by the order of the `categories`, not lexical order of the values. Parameters ---------- values : list-like - The values of the categorical. If levels are given, values not in levels will + The values of the categorical. If categories are given, values not in categories will be replaced with NaN. - levels : Index-like (unique), optional - The unique levels for this categorical. If not given, the levels are assumed + categories : Index-like (unique), optional + The unique categories for this categorical. If not given, the categories are assumed to be the unique values of values. ordered : boolean, optional Whether or not this categorical is treated as a ordered categorical. If not given, @@ -127,48 +130,37 @@ class Categorical(PandasObject): name : str, optional Name for the Categorical variable. If name is None, will attempt to infer from values. - compat : boolean, default=False - Whether to treat values as codes to the levels (old API, deprecated) Attributes ---------- - levels : Index - The levels of this categorical + categories : Index + The categories of this categorical codes : ndarray - The codes (integer positions, which point to the levels) of this categorical, read only + The codes (integer positions, which point to the categories) of this categorical, read only. ordered : boolean - Whether or not this Categorical is ordered + Whether or not this Categorical is ordered. name : string - The name of this Categorical + The name of this Categorical. 
Raises ------ ValueError - If the levels do not validate + If the categories do not validate. TypeError - If an explicit ``ordered=True`` is given but no `levels` and the `values` are not sortable + If an explicit ``ordered=True`` is given but no `categories` and the `values` are + not sortable. Examples -------- >>> from pandas import Categorical >>> Categorical([1, 2, 3, 1, 2, 3]) - 1 - 2 - 3 - 1 - 2 - 3 - Levels (3): Int64Index([1, 2, 3], dtype=int64), ordered + [1, 2, 3, 1, 2, 3] + Categories (3, int64): [1 < 2 < 3] >>> Categorical(['a', 'b', 'c', 'a', 'b', 'c']) - a - b - c - a - b - c - Levels (3): Index(['a', 'b', 'c'], dtype=object), ordered + [a, b, c, a, b, c] + Categories (3, object): [a < b < c] >>> a = Categorical(['a','b','c','a','b','c'], ['c', 'b', 'a']) >>> a.min() @@ -184,7 +176,7 @@ class Categorical(PandasObject): """Whether or not this Categorical is ordered. Only ordered `Categoricals` can be sorted (according to the order - of the levels) and have a min and max value. + of the categories) and have a min and max value. See also -------- @@ -197,19 +189,30 @@ class Categorical(PandasObject): # For comparisons, so that numpy uses our implementation if the compare ops, which raise __array_priority__ = 1000 - def __init__(self, values, levels=None, ordered=None, name=None, fastpath=False, compat=False): + def __init__(self, values, categories=None, ordered=None, name=None, fastpath=False, + levels=None): if fastpath: # fast path self._codes = values self.name = name - self.levels = levels + self.categories = categories self.ordered = ordered return if name is None: name = getattr(values, 'name', None) + # TODO: Remove after deprecation period in 2017/ after 0.18 + if not levels is None: + warn("Creating a 'Categorical' with 'levels' is deprecated, use 'categories' instead", + FutureWarning) + if categories is None: + categories = levels + else: + raise ValueError("Cannot pass in both 'categories' and (deprecated) 'levels', " + "use only 'categories'") + # sanitize input if com.is_categorical_dtype(values): @@ -217,8 +220,8 @@ def __init__(self, values, levels=None, ordered=None, name=None, fastpath=False, cat = values if isinstance(values, com.ABCSeries): cat = values.values - if levels is None: - levels = cat.levels + if categories is None: + categories = cat.categories if ordered is None: ordered = cat.ordered values = values.__array__() @@ -237,61 +240,58 @@ def __init__(self, values, levels=None, ordered=None, name=None, fastpath=False, values = _convert_to_list_like(values) from pandas.core.series import _sanitize_array # On list with NaNs, int values will be converted to float. Use "object" dtype - # to prevent this. In the end objects will be casted to int/... in the level + # to prevent this. In the end objects will be casted to int/... in the category # assignment step. 
dtype = 'object' if isnull(values).any() else None values = _sanitize_array(values, None, dtype=dtype) - if levels is None: + if categories is None: try: - codes, levels = factorize(values, sort=True) + codes, categories = factorize(values, sort=True) # If the underlying data structure was sortable, and the user doesn't want to # "forget" this order, the categorical also is sorted/ordered if ordered is None: ordered = True except TypeError: - codes, levels = factorize(values, sort=False) + codes, categories = factorize(values, sort=False) if ordered: - # raise, as we don't have a sortable data structure and so the usershould - # give us one by specifying levels - raise TypeError("'values' is not ordered, please explicitly specify the level " - "order by passing in a level argument.") + # raise, as we don't have a sortable data structure and so the user should + # give us one by specifying categories + raise TypeError("'values' is not ordered, please explicitly specify the " + "categories order by passing in a categories argument.") else: - # there are two ways if levels are present - # the old one, where each value is a int pointer to the levels array - # the new one, where each value is also in the level array (or np.nan) + # there were two ways if categories are present + # - the old one, where each value is a int pointer to the levels array -> not anymore + # possible, but code outside of pandas could call us like that, so make some checks + # - the new one, where each value is also in the categories array (or np.nan) # make sure that we always have the same type here, no matter what we get passed in - levels = self._validate_levels(levels) - - # There can be two ways: the old which passed in codes and levels directly - # and values have to be inferred and the new one, which passes in values and levels - # and _codes have to be inferred. - - # min and max can be higher and lower if not all levels are in the values - if compat and (com.is_integer_dtype(values) and - (np.min(values) >= -1) and (np.max(values) < len(levels))): - warn("Using 'values' as codes is deprecated.\n" - "'Categorical(... , compat=True)' is only there for historical reasons and " - "should not be used in new code!\n" - "See https://github.com/pydata/pandas/pull/7217", FutureWarning) - codes = values - else: - codes = _get_codes_for_values(values, levels) + categories = self._validate_categories(categories) - # if we got levels, we can assume that the order is intended - # if ordered is unspecified - if ordered is None: - ordered = True + codes = _get_codes_for_values(values, categories) + + # TODO: check for old style usage. These warnings should be removes after 0.18/ in 2016 + if com.is_integer_dtype(values) and not com.is_integer_dtype(categories): + warn("Values and categories have different dtypes. Did you mean to use\n" + "'Categorical.from_codes(codes, categories)'?", RuntimeWarning) + + if com.is_integer_dtype(values) and (codes == -1).all(): + warn("None of the categories were found in values. Did you mean to use\n" + "'Categorical.from_codes(codes, categories)'?", RuntimeWarning) + + # if we got categories, we can assume that the order is intended + # if ordered is unspecified + if ordered is None: + ordered = True self.ordered = False if ordered is None else ordered self._codes = codes - self.levels = levels + self.categories = categories self.name = name def copy(self): """ Copy constructor. 
""" - return Categorical(values=self._codes.copy(),levels=self.levels, + return Categorical(values=self._codes.copy(),categories=self.categories, name=self.name, ordered=self.ordered, fastpath=True) @classmethod @@ -299,20 +299,22 @@ def from_array(cls, data): """ Make a Categorical type from a single array-like object. + For internal compatibility with numpy arrays. + Parameters ---------- data : array-like - Can be an Index or array-like. The levels are assumed to be + Can be an Index or array-like. The categories are assumed to be the unique values of `data`. """ return Categorical(data) @classmethod - def from_codes(cls, codes, levels, ordered=False, name=None): + def from_codes(cls, codes, categories, ordered=False, name=None): """ - Make a Categorical type from codes and levels arrays. + Make a Categorical type from codes and categories arrays. - This constructor is useful if you already have codes and levels and so do not need the + This constructor is useful if you already have codes and categories and so do not need the (computation intensive) factorization step, which is usually done on the constructor. If your data does not follow this convention, please use the normal constructor. @@ -320,9 +322,9 @@ def from_codes(cls, codes, levels, ordered=False, name=None): Parameters ---------- codes : array-like, integers - An integer array, where each integer points to a level in levels or -1 for NaN - levels : index-like - The levels for the categorical. Items need to be unique. + An integer array, where each integer points to a category in categories or -1 for NaN + categories : index-like + The categories for the categorical. Items need to be unique. ordered : boolean, optional Whether or not this categorical is treated as a ordered categorical. If not given, the resulting categorical will be unordered. @@ -334,18 +336,18 @@ def from_codes(cls, codes, levels, ordered=False, name=None): except: raise ValueError("codes need to be convertible to an arrays of integers") - levels = cls._validate_levels(levels) + categories = cls._validate_categories(categories) - if codes.max() >= len(levels) or codes.min() < -1: - raise ValueError("codes need to be between -1 and len(levels)-1") + if codes.max() >= len(categories) or codes.min() < -1: + raise ValueError("codes need to be between -1 and len(categories)-1") - return Categorical(codes, levels=levels, ordered=ordered, name=name, fastpath=True) + return Categorical(codes, categories=categories, ordered=ordered, name=name, fastpath=True) _codes = None def _get_codes(self): - """ Get the level codes. + """ Get the codes. Returns ------- @@ -365,7 +367,7 @@ def _set_codes(self, codes): codes = property(fget=_get_codes, fset=_set_codes, doc=_codes_doc) def _get_labels(self): - """ Get the level labels (deprecated). + """ Get the category labels (deprecated). Deprecated, use .codes! """ @@ -375,84 +377,309 @@ def _get_labels(self): labels = property(fget=_get_labels, fset=_set_codes) - _levels = None + _categories = None @classmethod - def _validate_levels(cls, levels): - """" Validates that we have good levels """ - if not isinstance(levels, Index): + def _validate_categories(cls, categories): + """" Validates that we have good categories """ + if not isinstance(categories, Index): dtype = None - if not hasattr(levels, "dtype"): - levels = _convert_to_list_like(levels) - # on levels with NaNs, int values would be converted to float. Use "object" dtype - # to prevent this. 
- if isnull(levels).any(): - without_na = np.array([x for x in levels if com.notnull(x)]) - with_na = np.array(levels) + if not hasattr(categories, "dtype"): + categories = _convert_to_list_like(categories) + # on categories with NaNs, int values would be converted to float. + # Use "object" dtype to prevent this. + if isnull(categories).any(): + without_na = np.array([x for x in categories if com.notnull(x)]) + with_na = np.array(categories) if with_na.dtype != without_na.dtype: dtype = "object" - levels = Index(levels, dtype=dtype) - if not levels.is_unique: - raise ValueError('Categorical levels must be unique') - return levels + categories = Index(categories, dtype=dtype) + if not categories.is_unique: + raise ValueError('Categorical categories must be unique') + return categories + + def _set_categories(self, categories): + """ Sets new categories """ + categories = self._validate_categories(categories) + if not self._categories is None and len(categories) != len(self._categories): + raise ValueError("new categories need to have the same number of items than the old " + "categories!") + self._categories = categories + + def _get_categories(self): + """ Gets the categories """ + # categories is an Index, which is immutable -> no need to copy + return self._categories + + categories = property(fget=_get_categories, fset=_set_categories, doc=_categories_doc) def _set_levels(self, levels): - """ Sets new levels """ - levels = self._validate_levels(levels) - - if not self._levels is None and len(levels) < len(self._levels): - # remove all _codes which are larger - self._codes[self._codes >= len(levels)] = -1 - self._levels = levels + """ set new levels (deprecated, use "categories") """ + warn("Assigning to 'levels' is deprecated, use 'categories'", FutureWarning) + self.categories = levels def _get_levels(self): - """ Gets the levels """ - # levels is an Index, which is immutable -> no need to copy - return self._levels + """ Gets the levels (deprecated, use "categories") """ + warn("Accessing 'levels' is deprecated, use 'categories'", FutureWarning) + return self.categories - levels = property(fget=_get_levels, fset=_set_levels, doc=_levels_doc) + # TODO: Remove after deprecation period in 2017/ after 0.18 + levels = property(fget=_get_levels, fset=_set_levels) - def reorder_levels(self, new_levels, ordered=None): - """ Reorders levels as specified in new_levels. + def set_categories(self, new_categories, ordered=None, rename=False, inplace=False): + """ Sets the categories to the specified new_categories. - `new_levels` must include all old levels but can also include new level items. In - contrast to assigning to `levels`, these new level items can be in arbitrary positions. + `new_categories` can include new categories (which will result in unused categories) or + or remove old categories (which results in values set to NaN). If `rename==True`, + the categories will simple be renamed (less or more items than in old categories will + result in values set to NaN or in unused categories respectively). - The level reordering is done inplace. + This method can be used to perform more than one action of adding, removing, + and reordering simultaneously and is therefore faster than performing the individual steps + via the more specialised methods. 
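# Sketch of the new `categories` property (assuming this patch): assigning to it
# renames the categories in place and requires the same number of items, while
# the old `levels` name still works but raises a FutureWarning.
import pandas as pd

cat = pd.Categorical(["a", "b", "c", "a"])
cat.categories = ["x", "y", "z"]     # in-place rename -> [x, y, z, x]
try:
    cat.categories = ["x", "y"]      # wrong length -> ValueError
except ValueError:
    pass
cat.levels                           # deprecated alias, emits FutureWarning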
+ + On the other hand this methods does not do checks (e.g., whether the old categories are + included in the new categories on a reorder), which can result in surprising changes, for + example when using special string dtypes on python3, which does not considers a S1 string + equal to a single char python string. Raises ------ ValueError - If the new levels do not contain all old level items + If new_categories does not validate as categories Parameters ---------- - new_levels : Index-like - The levels in new order. must be of same length as the old levels + new_categories : Index-like + The categories in new order. ordered : boolean, optional Whether or not the categorical is treated as a ordered categorical. If not given, do not change the ordered information. + rename : boolean (default: False) + Whether or not the new_categories should be considered as a rename of the old + categories or as reordered categories. + inplace : boolean (default: False) + Whether or not to reorder the categories inplace or return a copy of this categorical + with reordered categories. + + Returns + ------- + cat : Categorical with reordered categories or None if inplace. + + See also + -------- + rename_categories + reorder_categories + add_categories + remove_categories + remove_unused_categories """ - new_levels = self._validate_levels(new_levels) + new_categories = self._validate_categories(new_categories) + cat = self if inplace else self.copy() + if rename: + if not cat._categories is None and len(new_categories) < len(cat._categories): + # remove all _codes which are larger and set to -1/NaN + self._codes[self._codes >= len(new_categories)] = -1 + cat._categories = new_categories + else: + values = cat.__array__() + cat._codes = _get_codes_for_values(values, new_categories) + cat._categories = new_categories - if len(new_levels) < len(self._levels) or len(self._levels.difference(new_levels)): - raise ValueError('Reordered levels must include all original levels') - values = self.__array__() - self._codes = _get_codes_for_values(values, new_levels) - self._levels = new_levels if not ordered is None: - self.ordered = ordered + cat.ordered = ordered + + if not inplace: + return cat + + def rename_categories(self, new_categories, inplace=False): + """ Renames categories. + + The new categories has to be a list-like object. All items must be unique and the number of + items in the new categories must be the same as the number of items in the old categories. + + Raises + ------ + ValueError + If the new categories do not have the same number of items than the current categories + or do not validate as categories + + Parameters + ---------- + new_categories : Index-like + The renamed categories. + inplace : boolean (default: False) + Whether or not to rename the categories inplace or return a copy of this categorical + with renamed categories. + + Returns + ------- + cat : Categorical with renamed categories added or None if inplace. + + See also + -------- + reorder_categories + add_categories + remove_categories + remove_unused_categories + set_categories + """ + cat = self if inplace else self.copy() + cat.categories = new_categories + if not inplace: + return cat + + def reorder_categories(self, new_categories, ordered=None, inplace=False): + """ Reorders categories as specified in new_categories. + + `new_categories` need to include all old categories and no new category items. 
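# Sketch of `set_categories` and `rename_categories` (assuming this patch): both
# return a new Categorical by default and only modify `self` with `inplace=True`.
import pandas as pd

cat = pd.Categorical(["a", "b", "c", "a"], ordered=True)
cat.set_categories(["c", "b", "a"])               # reorders; returns a new object
cat.set_categories(["a", "b"])                    # 'c' is dropped -> becomes NaN
cat.set_categories(["x", "y", "z"], rename=True)  # pure relabelling of the categories
cat.rename_categories(["x", "y", "z"])            # same rename, but with a length check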
+ + Raises + ------ + ValueError + If the new categories do not contain all old category items or any new ones + + Parameters + ---------- + new_categories : Index-like + The categories in new order. + ordered : boolean, optional + Whether or not the categorical is treated as a ordered categorical. If not given, + do not change the ordered information. + inplace : boolean (default: False) + Whether or not to reorder the categories inplace or return a copy of this categorical + with reordered categories. + + Returns + ------- + cat : Categorical with reordered categories or None if inplace. + + See also + -------- + rename_categories + add_categories + remove_categories + remove_unused_categories + set_categories + """ + if set(self._categories) != set(new_categories): + raise ValueError("items in new_categories are not the same as in old categories") + return self.set_categories(new_categories, ordered=ordered, inplace=inplace) - def remove_unused_levels(self): - """ Removes levels which are not used. + def add_categories(self, new_categories, inplace=False): + """ Add new categories. - The level removal is done inplace. + `new_categories` will be included at the last/highest place in the categories and will be + unused directly after this call. + + Raises + ------ + ValueError + If the new categories include old categories or do not validate as categories + + Parameters + ---------- + new_categories : category or list-like of category + The new categories to be included. + inplace : boolean (default: False) + Whether or not to add the categories inplace or return a copy of this categorical + with added categories. + + Returns + ------- + cat : Categorical with new categories added or None if inplace. + + See also + -------- + rename_categories + reorder_categories + remove_categories + remove_unused_categories + set_categories """ - _used = sorted(np.unique(self._codes)) - new_levels = self.levels.take(com._ensure_platform_int(_used)) - new_levels = _ensure_index(new_levels) - self._codes = _get_codes_for_values(self.__array__(), new_levels) - self._levels = new_levels + if not com.is_list_like(new_categories): + new_categories = [new_categories] + already_included = set(new_categories) & set(self._categories) + if len(already_included) != 0: + msg = "new categories must not include old categories: %s" % str(already_included) + raise ValueError(msg) + new_categories = list(self._categories) + (new_categories) + new_categories = self._validate_categories(new_categories) + cat = self if inplace else self.copy() + cat._categories = new_categories + if not inplace: + return cat + + def remove_categories(self, removals, inplace=False): + """ Removes the specified categories. + + `removals` must be included in the old categories. Values which were in the removed + categories will be set to NaN + + Raises + ------ + ValueError + If the removals are not contained in the categories + + Parameters + ---------- + removals : category or list of categories + The categories which should be removed. + inplace : boolean (default: False) + Whether or not to remove the categories inplace or return a copy of this categorical + with removed categories. + + Returns + ------- + cat : Categorical with removed categories or None if inplace. 
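# Sketch of `reorder_categories` and `add_categories` (assuming this patch):
# `reorder_categories` requires exactly the old categories in a new order, while
# `add_categories` appends new, initially unused, categories at the end.
import pandas as pd

cat = pd.Categorical(["a", "b", "c", "a"], ordered=True)
cat = cat.reorder_categories(["c", "b", "a"], ordered=True)
cat = cat.add_categories(["d", "e"])
# cat.categories -> Index(['c', 'b', 'a', 'd', 'e'], dtype='object')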
+ + See also + -------- + rename_categories + reorder_categories + add_categories + remove_unused_categories + set_categories + """ + if not com.is_list_like(removals): + removals = [removals] + not_included = set(removals) - set(self._categories) + if len(not_included) != 0: + raise ValueError("removals must all be in old categories: %s" % str(not_included)) + new_categories = set(self._categories) - set(removals) + return self.set_categories(new_categories, ordered=self.ordered, rename=False, + inplace=inplace) + + + def remove_unused_categories(self, inplace=False): + """ Removes categories which are not used. + + Parameters + ---------- + inplace : boolean (default: False) + Whether or not to drop unused categories inplace or return a copy of this categorical + with unused categories dropped. + + Returns + ------- + cat : Categorical with unused categories dropped or None if inplace. + + See also + -------- + rename_categories + reorder_categories + add_categories + remove_categories + set_categories + """ + cat = self if inplace else self.copy() + _used = sorted(np.unique(cat._codes)) + new_categories = cat.categories.take(com._ensure_platform_int(_used)) + new_categories = _ensure_index(new_categories) + cat._codes = _get_codes_for_values(cat.__array__(), new_categories) + cat._categories = new_categories + if not inplace: + return cat __eq__ = _cat_compare_op('__eq__') @@ -483,10 +710,10 @@ def __array__(self, dtype=None): ------- values : numpy array A numpy array of either the specified dtype or, if dtype==None (default), the same - dtype as categorical.levels.dtype + dtype as categorical.categories.dtype """ - ret = com.take_1d(self.levels.values, self._codes) - if dtype and dtype != self.levels.dtype: + ret = com.take_1d(self.categories.values, self._codes) + if dtype and dtype != self.categories.dtype: return np.asarray(ret, dtype) return ret @@ -498,7 +725,7 @@ def isnull(self): """ Detect missing values - Both missing values (-1 in .codes) and NA as a level are detected. + Both missing values (-1 in .codes) and NA as a category are detected. Returns ------- @@ -512,11 +739,11 @@ def isnull(self): ret = self._codes == -1 - # String/object and float levels can hold np.nan - if self.levels.dtype.kind in ['S', 'O', 'f']: - if np.nan in self.levels: - nan_pos = np.where(isnull(self.levels))[0] - # we only have one NA in levels + # String/object and float categories can hold np.nan + if self.categories.dtype.kind in ['S', 'O', 'f']: + if np.nan in self.categories: + nan_pos = np.where(isnull(self.categories))[0] + # we only have one NA in categories ret = np.logical_or(ret , self._codes == nan_pos) return ret @@ -524,7 +751,7 @@ def notnull(self): """ Reverse of isnull - Both missing values (-1 in .codes) and NA as a level are detected as null. + Both missing values (-1 in .codes) and NA as a category are detected as null. 
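# Sketch of `remove_categories` and `remove_unused_categories` (assuming this
# patch): removing a category sets the affected values to NaN, while
# `remove_unused_categories` only drops categories no value points to.
import pandas as pd

cat = pd.Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
cat = cat.remove_categories(["b"])       # the 'b' value becomes NaN
cat = cat.remove_unused_categories()     # drops the never-used 'd'
# only 'a' and 'c' remain as categories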
Returns ------- @@ -545,12 +772,13 @@ def get_values(self): Returns ------- values : numpy array - A numpy array of the same dtype as categorical.levels.dtype or dtype string if periods + A numpy array of the same dtype as categorical.categories.dtype or dtype string if + periods """ # if we are a period index, return a string repr - if isinstance(self.levels, PeriodIndex): - return com.take_1d(np.array(self.levels.to_native_types(), dtype=object), + if isinstance(self.categories, PeriodIndex): + return com.take_1d(np.array(self.categories.to_native_types(), dtype=object), self._codes) return np.array(self) @@ -574,7 +802,7 @@ def argsort(self, ascending=True, **kwargs): return result def order(self, inplace=False, ascending=True, na_position='last', **kwargs): - """ Sorts the Category by level value returning a new Categorical by default. + """ Sorts the Category by category value returning a new Categorical by default. Only ordered Categoricals can be sorted! @@ -628,12 +856,12 @@ def order(self, inplace=False, ascending=True, na_position='last', **kwargs): self._codes = codes return else: - return Categorical(values=codes,levels=self.levels, ordered=self.ordered, + return Categorical(values=codes,categories=self.categories, ordered=self.ordered, name=self.name, fastpath=True) def sort(self, inplace=True, ascending=True, na_position='last', **kwargs): - """ Sorts the Category inplace by level value. + """ Sorts the Category inplace by category value. Only ordered Categoricals can be sorted! @@ -683,7 +911,14 @@ def view(self): return self def to_dense(self): - """ Return my 'dense' repr """ + """Return my 'dense' representation + + For internal compatibility with numpy arrays. + + Returns + ------- + dense : array + """ return np.asarray(self) def fillna(self, fill_value=None, method=None, limit=None, **kwargs): @@ -712,12 +947,12 @@ def fillna(self, fill_value=None, method=None, limit=None, **kwargs): values = self._codes - # Make sure that we also get NA in levels - if self.levels.dtype.kind in ['S', 'O', 'f']: - if np.nan in self.levels: + # Make sure that we also get NA in categories + if self.categories.dtype.kind in ['S', 'O', 'f']: + if np.nan in self.categories: values = values.copy() - nan_pos = np.where(isnull(self.levels))[0] - # we only have one NA in levels + nan_pos = np.where(isnull(self.categories))[0] + # we only have one NA in categories values[values == nan_pos] = -1 @@ -726,38 +961,44 @@ def fillna(self, fill_value=None, method=None, limit=None, **kwargs): values = self.to_dense().reshape(-1,len(self)) values = com.interpolate_2d( - values, method, 0, None, fill_value).astype(self.levels.dtype)[0] - values = _get_codes_for_values(values, self.levels) + values, method, 0, None, fill_value).astype(self.categories.dtype)[0] + values = _get_codes_for_values(values, self.categories) else: - if not com.isnull(fill_value) and fill_value not in self.levels: - raise ValueError("fill value must be in levels") + if not com.isnull(fill_value) and fill_value not in self.categories: + raise ValueError("fill value must be in categories") mask = values==-1 if mask.any(): values = values.copy() - values[mask] = self.levels.get_loc(fill_value) + values[mask] = self.categories.get_loc(fill_value) - return Categorical(values, levels=self.levels, ordered=self.ordered, + return Categorical(values, categories=self.categories, ordered=self.ordered, name=self.name, fastpath=True) def take_nd(self, indexer, allow_fill=True, fill_value=None): - """ Take the codes by the indexer, fill with the 
fill_value. """ + """ Take the codes by the indexer, fill with the fill_value. + + For internal compatibility with numpy arrays. + """ # filling must always be None/nan here # but is passed thru internally assert isnull(fill_value) codes = com.take_1d(self._codes, indexer, allow_fill=True, fill_value=-1) - result = Categorical(codes, levels=self.levels, ordered=self.ordered, + result = Categorical(codes, categories=self.categories, ordered=self.ordered, name=self.name, fastpath=True) return result take = take_nd def _slice(self, slicer): - """ Return a slice of myself. """ + """ Return a slice of myself. + + For internal compatibility with numpy arrays. + """ # only allow 1 dimensional slicing, but can # in a 2-d case be passd (slice(None),....) @@ -767,42 +1008,44 @@ def _slice(self, slicer): slicer = slicer[1] _codes = self._codes[slicer] - return Categorical(values=_codes,levels=self.levels, ordered=self.ordered, + return Categorical(values=_codes,categories=self.categories, ordered=self.ordered, name=self.name, fastpath=True) def __len__(self): + """The length of this Categorical.""" return len(self._codes) def __iter__(self): + """Returns an Iterator over the values of this Categorical.""" return iter(np.array(self)) - def _tidy_repr(self, max_vals=20): + def _tidy_repr(self, max_vals=10): num = max_vals // 2 head = self[:num]._get_repr(length=False, name=False, footer=False) tail = self[-(max_vals - num):]._get_repr(length=False, name=False, footer=False) - result = '%s\n...\n%s' % (head, tail) + result = '%s, ..., %s' % (head[:-1], tail[1:]) result = '%s\n%s' % (result, self._repr_footer()) return compat.text_type(result) - def _repr_level_info(self): + def _repr_categories_info(self): """ Returns a string representation of the footer.""" - max_levels = (10 if get_option("display.max_levels") == 0 - else get_option("display.max_levels")) - level_strs = fmt.format_array(self.levels.get_values(), None) - if len(level_strs) > max_levels: - num = max_levels // 2 - head = level_strs[:num] - tail = level_strs[-(max_levels - num):] - level_strs = head + ["..."] + tail + max_categories = (10 if get_option("display.max_categories") == 0 + else get_option("display.max_categories")) + category_strs = fmt.format_array(self.categories.get_values(), None) + if len(category_strs) > max_categories: + num = max_categories // 2 + head = category_strs[:num] + tail = category_strs[-(max_categories - num):] + category_strs = head + ["..."] + tail # Strip all leading spaces, which format_array adds for columns... 
- level_strs = [x.strip() for x in level_strs] - levheader = "Levels (%d, %s): " % (len(self.levels), - self.levels.dtype) + category_strs = [x.strip() for x in category_strs] + levheader = "Categories (%d, %s): " % (len(self.categories), + self.categories.dtype) width, height = get_terminal_size() max_width = (width if get_option("display.width") == 0 else get_option("display.width")) @@ -813,7 +1056,7 @@ def _repr_level_info(self): start = True cur_col_len = len(levheader) sep_len, sep = (3, " < ") if self.ordered else (2, ", ") - for val in level_strs: + for val in category_strs: if max_width != 0 and cur_col_len + sep_len + len(val) > max_width: levstring += "\n" + (" "* len(levheader)) cur_col_len = len(levheader) @@ -829,7 +1072,7 @@ def _repr_footer(self): namestr = "Name: %s, " % self.name if self.name is not None else "" return u('%sLength: %d\n%s') % (namestr, - len(self), self._repr_level_info()) + len(self), self._repr_categories_info()) def _get_repr(self, name=False, length=True, na_rep='NaN', footer=True): formatter = fmt.CategoricalFormatter(self, name=name, @@ -840,17 +1083,14 @@ def _get_repr(self, name=False, length=True, na_rep='NaN', footer=True): def __unicode__(self): """ Unicode representation. """ - width, height = get_terminal_size() - max_rows = (height if get_option("display.max_rows") == 0 - else get_option("display.max_rows")) - - if len(self._codes) > (max_rows or 1000): - result = self._tidy_repr(min(30, max_rows) - 4) + _maxlen = 10 + if len(self._codes) > _maxlen: + result = self._tidy_repr(_maxlen) elif len(self._codes) > 0: - result = self._get_repr(length=len(self) > 50, + result = self._get_repr(length=len(self) > _maxlen, name=True) else: - result = 'Categorical([], %s' % self._get_repr(name=True, + result = '[], %s' % self._get_repr(name=True, length=False, footer=True, ).replace("\n",", ") @@ -864,9 +1104,9 @@ def __getitem__(self, key): if i == -1: return np.nan else: - return self.levels[i] + return self.categories[i] else: - return Categorical(values=self._codes[key], levels=self.levels, + return Categorical(values=self._codes[key], categories=self.categories, ordered=self.ordered, fastpath=True) def __setitem__(self, key, value): @@ -876,22 +1116,23 @@ def __setitem__(self, key, value): Raises ------ ValueError - If (one or more) Value is not in levels or if a assigned `Categorical` has not the - same levels + If (one or more) Value is not in categories or if a assigned `Categorical` has not the + same categories """ - # require identical level set + # require identical categories set if isinstance(value, Categorical): - if not value.levels.equals(self.levels): - raise ValueError("cannot set a Categorical with another, without identical levels") + if not value.categories.equals(self.categories): + raise ValueError("Cannot set a Categorical with another, without identical " + "categories") rvalue = value if com.is_list_like(value) else [value] - to_add = Index(rvalue).difference(self.levels) - # no assignments of values not in levels, but it's always ok to set something to np.nan + to_add = Index(rvalue).difference(self.categories) + # no assignments of values not in categories, but it's always ok to set something to np.nan if len(to_add) and not isnull(to_add).all(): - raise ValueError("cannot setitem on a Categorical with a new level," - " set the levels first") + raise ValueError("cannot setitem on a Categorical with a new category," + " set the categories first") # set by position if isinstance(key, (int, np.integer)): @@ -922,14 +1163,14 @@ 
def __setitem__(self, key, value): # FIXME: remove when numpy 1.9 is the lowest numpy version pandas accepts... key = np.asarray(key) - lindexer = self.levels.get_indexer(rvalue) + lindexer = self.categories.get_indexer(rvalue) # FIXME: the following can be removed after https://github.com/pydata/pandas/issues/7820 # is fixed. - # float levels do currently return -1 for np.nan, even if np.nan is included in the index - # "repair" this here - if isnull(rvalue).any() and isnull(self.levels).any(): - nan_pos = np.where(com.isnull(self.levels))[0] + # float categories do currently return -1 for np.nan, even if np.nan is included in the + # index -> "repair" this here + if isnull(rvalue).any() and isnull(self.categories).any(): + nan_pos = np.where(com.isnull(self.categories))[0] lindexer[lindexer == -1] = nan_pos self._codes[key] = lindexer @@ -967,7 +1208,7 @@ def min(self, numeric_only=None, **kwargs): if pointer == -1: return np.nan else: - return self.levels[pointer] + return self.categories[pointer] def max(self, numeric_only=None, **kwargs): @@ -994,7 +1235,7 @@ def max(self, numeric_only=None, **kwargs): if pointer == -1: return np.nan else: - return self.levels[pointer] + return self.categories[pointer] def mode(self): """ @@ -1011,7 +1252,7 @@ def mode(self): import pandas.hashtable as htable good = self._codes != -1 result = Categorical(sorted(htable.mode_int64(com._ensure_int64(self._codes[good]))), - levels=self.levels,ordered=self.ordered, name=self.name, + categories=self.categories,ordered=self.ordered, name=self.name, fastpath=True) return result @@ -1019,13 +1260,13 @@ def unique(self): """ Return the unique values. - This includes all levels, even if one or more is unused. + This includes all categories, even if one or more is unused. Returns ------- unique values : array """ - return self.levels + return np.asarray(self.categories) def equals(self, other): """ @@ -1044,7 +1285,7 @@ def equals(self, other): if not isinstance(other, Categorical): return False # TODO: should this also test if name is equal? - return (self.levels.equals(other.levels) and self.ordered == other.ordered and + return (self.categories.equals(other.categories) and self.ordered == other.ordered and np.array_equal(self._codes, other._codes)) def describe(self): @@ -1053,7 +1294,7 @@ def describe(self): Returns ------- description: `DataFrame` - A dataframe with frequency and counts by level. + A dataframe with frequency and counts by category. """ # Hack? 
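# Sketch of item assignment and reductions (assuming this patch): values can only
# be set to an existing category (or NaN), `min`/`max` respect the category order
# of an ordered categorical, and `unique` now returns the categories as a plain
# numpy array (including unused ones).
import pandas as pd

cat = pd.Categorical(["a", "b", "c", "a"], ordered=True)
cat[0] = "b"           # fine, 'b' is an existing category
try:
    cat[0] = "x"       # not a category -> ValueError ("set the categories first")
except ValueError:
    pass
cat.min(), cat.max()   # ('a', 'c'), using the category order
cat.unique()           # array(['a', 'b', 'c'], dtype=object)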
from pandas.core.frame import DataFrame @@ -1068,38 +1309,45 @@ def describe(self): result = concat([counts,freqs],axis=1) result.columns = ['counts','freqs'] - # fill in the real levels + # fill in the real categories check = result.index == -1 if check.any(): # Sort -1 (=NaN) to the last position - index = np.arange(0, len(self.levels)+1, dtype='int64') + index = np.arange(0, len(self.categories)+1, dtype='int64') index[-1] = -1 result = result.reindex(index) # build new index - levels = np.arange(0,len(self.levels)+1 ,dtype=object) - levels[:-1] = self.levels - levels[-1] = np.nan - result.index = levels.take(com._ensure_platform_int(result.index)) + categories = np.arange(0,len(self.categories)+1 ,dtype=object) + categories[:-1] = self.categories + categories[-1] = np.nan + result.index = categories.take(com._ensure_platform_int(result.index)) else: - result.index = self.levels.take(com._ensure_platform_int(result.index)) - result = result.reindex(self.levels) - result.index.name = 'levels' + result.index = self.categories.take(com._ensure_platform_int(result.index)) + result = result.reindex(self.categories) + result.index.name = 'categories' return result ##### The Series.cat accessor ##### -class CategoricalProperties(PandasDelegate): +class CategoricalAccessor(PandasDelegate): """ Accessor object for categorical properties of the Series values. + Be aware that assigning to `categories` is a inplace operation, while all methods return + new categorical data per default (but can be called with `inplace=True`). + Examples -------- - >>> s.cat.levels - >>> s.cat.levels = list('abc') - >>> s.cat.reorder_levels(list('cab')) + >>> s.cat.categories + >>> s.cat.categories = list('abc') + >>> s.cat.rename_categories(list('cab')) + >>> s.cat.reorder_categories(list('cab')) + >>> s.cat.add_categories(['d','e']) + >>> s.cat.remove_categories(['d']) + >>> s.cat.remove_unused_categories() + >>> s.cat.set_categories(list('abcde')) - Allows accessing to specific getter and access methods """ def __init__(self, values, index): @@ -1112,31 +1360,45 @@ def _delegate_property_get(self, name): def _delegate_property_set(self, name, new_values): return setattr(self.categorical, name, new_values) + @property + def codes(self): + from pandas import Series + return Series(self.categorical.codes, index=self.index) + def _delegate_method(self, name, *args, **kwargs): + from pandas import Series method = getattr(self.categorical, name) - return method(*args, **kwargs) - -CategoricalProperties._add_delegate_accessors(delegate=Categorical, - accessors=["levels", "ordered"], - typ='property') -CategoricalProperties._add_delegate_accessors(delegate=Categorical, - accessors=["reorder_levels", "remove_unused_levels"], - typ='method') + res = method(*args, **kwargs) + if not res is None: + return Series(res, index=self.index) + +# TODO: remove levels after the deprecation period +CategoricalAccessor._add_delegate_accessors(delegate=Categorical, + accessors=["categories", "ordered"], + typ='property') +CategoricalAccessor._add_delegate_accessors(delegate=Categorical, + accessors=["rename_categories", + "reorder_categories", + "add_categories", + "remove_categories", + "remove_unused_categories", + "set_categories"], + typ='method') ##### utility routines ##### -def _get_codes_for_values(values, levels): +def _get_codes_for_values(values, categories): """" - utility routine to turn values into codes given the specified levels + utility routine to turn values into codes given the specified categories """ from 
pandas.core.algorithms import _get_data_algo, _hashtables - if values.dtype != levels.dtype: + if values.dtype != categories.dtype: values = com._ensure_object(values) - levels = com._ensure_object(levels) + categories = com._ensure_object(categories) (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables) - t = hash_klass(len(levels)) - t.map_locations(com._values_from_object(levels)) + t = hash_klass(len(categories)) + t.map_locations(com._values_from_object(categories)) return com._ensure_platform_int(t.lookup(values)) def _convert_to_list_like(list_like): diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 8379266533c86..1d93b9d5e69c1 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -59,10 +59,10 @@ correct auto-detection. """ -pc_max_levels_doc = """ +pc_max_categories_doc = """ : int - This sets the maximum number of levels pandas should output when printing - out a `Categorical`. + This sets the maximum number of categories pandas should output when printing + out a `Categorical` or a Series of dtype "category". """ pc_max_info_cols_doc = """ @@ -237,7 +237,7 @@ def mpl_style_cb(key): validator=is_instance_factory((int, type(None)))) cf.register_option('max_rows', 60, pc_max_rows_doc, validator=is_instance_factory([type(None), int])) - cf.register_option('max_levels', 8, pc_max_levels_doc, validator=is_int) + cf.register_option('max_categories', 8, pc_max_categories_doc, validator=is_int) cf.register_option('max_colwidth', 50, max_colwidth_doc, validator=is_int) cf.register_option('max_columns', 20, pc_max_cols_doc, validator=is_instance_factory([type(None), int])) diff --git a/pandas/core/format.py b/pandas/core/format.py index fe5cbb7337aec..2773cc0c135c1 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -90,7 +90,7 @@ def _get_footer(self): footer += ', ' footer += "Length: %d" % len(self.categorical) - level_info = self.categorical._repr_level_info() + level_info = self.categorical._repr_categories_info() # Levels are added in a newline if footer: @@ -116,6 +116,9 @@ def to_string(self): fmt_values = self._get_formatted_values() result = ['%s' % i for i in fmt_values] + result = [i.strip() for i in result] + result = u(', ').join(result) + result = [u('[')+result+u(']')] if self.footer: footer = self._get_footer() if footer: @@ -173,7 +176,7 @@ def _get_footer(self): # level infos are added to the end and in a new line, like it is done for Categoricals # Only added when we request a name if self.name and com.is_categorical_dtype(self.series.dtype): - level_info = self.series.values._repr_level_info() + level_info = self.series.values._repr_categories_info() if footer: footer += "\n" footer += level_info diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index fb538e7cbe44d..db04f4933d5aa 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1924,7 +1924,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, self.grouper = np.asarray(factor) self._labels = factor.codes - self._group_index = factor.levels + self._group_index = factor.categories if self.name is None: self.name = factor.name @@ -3545,7 +3545,7 @@ def _lexsort_indexer(keys, orders=None, na_position='last'): if na_position not in ['last','first']: raise ValueError('invalid na_position: {!r}'.format(na_position)) - n = len(c.levels) + n = len(c.categories) codes = c.codes.copy() mask = (c.codes == -1) diff --git a/pandas/core/index.py b/pandas/core/index.py index 23f4cfd442a59..b58546da086b7 
100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -3206,7 +3206,7 @@ def from_arrays(cls, arrays, sortorder=None, names=None): return Index(arrays[0], name=name) cats = [Categorical.from_array(arr) for arr in arrays] - levels = [c.levels for c in cats] + levels = [c.categories for c in cats] labels = [c.codes for c in cats] if names is None: names = [c.name for c in cats] @@ -3301,7 +3301,7 @@ def from_product(cls, iterables, sortorder=None, names=None): categoricals = [Categorical.from_array(it) for it in iterables] labels = cartesian_product([c.codes for c in categoricals]) - return MultiIndex(levels=[c.levels for c in categoricals], + return MultiIndex(levels=[c.categories for c in categoricals], labels=labels, sortorder=sortorder, names=names) @property diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 053b92b2ad547..0055947c59210 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1698,12 +1698,12 @@ def _concat_blocks(self, blocks, values): return the block concatenation """ - levels = self.values.levels + categories = self.values.categories for b in blocks: - if not levels.equals(b.values.levels): + if not categories.equals(b.values.categories): raise ValueError("incompatible levels in categorical block merge") - return self._holder(values[0], levels=levels) + return self._holder(values[0], categories=categories) def to_native_types(self, slicer=None, na_rep='', **kwargs): """ convert to our native types format, slicing if desired """ diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 95d279add172c..686a0c4f6cca4 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -99,7 +99,7 @@ def panel_index(time, panels, names=['time', 'panel']): panel_factor = Categorical.from_array(panels) labels = [time_factor.codes, panel_factor.codes] - levels = [time_factor.levels, panel_factor.levels] + levels = [time_factor.categories, panel_factor.categories] return MultiIndex(levels, labels, sortorder=None, names=names, verify_integrity=False) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 458f672530270..bb6f6f4d00cd8 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -1113,7 +1113,7 @@ def check_len(item, name): def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False): # Series avoids inconsistent NaN handling cat = Categorical.from_array(Series(data)) - levels = cat.levels + levels = cat.categories # if all NaN if not dummy_na and len(levels) == 0: @@ -1130,7 +1130,7 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False): dummy_mat = np.eye(number_of_cols).take(cat.codes, axis=0) if dummy_na: - levels = np.append(cat.levels, np.nan) + levels = np.append(cat.categories, np.nan) else: # reset NaN GH4446 dummy_mat[cat.codes == -1] = 0 @@ -1182,7 +1182,7 @@ def make_axis_dummies(frame, axis='minor', transform=None): mapped_items = items.map(transform) cat = Categorical.from_array(mapped_items.take(labels)) labels = cat.codes - items = cat.levels + items = cat.categories values = np.eye(len(items), dtype=float) values = values.take(labels, axis=0) diff --git a/pandas/core/series.py b/pandas/core/series.py index 078bf0def241e..f77d3a60adee2 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -922,7 +922,7 @@ def _repr_footer(self): # Categorical if com.is_categorical_dtype(self.dtype): - level_info = self.values._repr_level_info() + level_info = self.values._repr_categories_info() return u('%sLength: %d, dtype: %s\n%s') % (namestr, len(self), 
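# Sketch of the renamed `Series.cat` accessor (assuming this patch): the
# category-changing methods are available on a Series of dtype "category" and
# return a new Series by default, assigning to `s.cat.categories` works in
# place, and `s.cat.codes` returns the integer codes aligned to the index.
import pandas as pd

s = pd.Series(["a", "b", "c", "a"]).astype("category")
s.cat.categories                     # Index(['a', 'b', 'c'], dtype='object')
s.cat.codes                          # Series of codes [0, 1, 2, 0] with s's index
s = s.cat.add_categories(["d"])      # returns a new Series
s.cat.remove_unused_categories()     # returns a new Series with 'd' dropped again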
str(self.dtype.name), @@ -2442,10 +2442,10 @@ def dt(self): @cache_readonly def cat(self): - from pandas.core.categorical import CategoricalProperties + from pandas.core.categorical import CategoricalAccessor if not com.is_categorical_dtype(self.dtype): raise TypeError("Can only use .cat accessor with a 'category' dtype") - return CategoricalProperties(self.values, self.index) + return CategoricalAccessor(self.values, self.index) Series._setup_axes(['index'], info_axis=0, stat_axis=0, aliases={'rows': 0}) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 5a68cb16f058f..50b279dbe675f 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3522,8 +3522,8 @@ def read(self, where=None, columns=None, **kwargs): return None factors = [Categorical.from_array(a.values) for a in self.index_axes] - levels = [f.levels for f in factors] - N = [len(f.levels) for f in factors] + levels = [f.categories for f in factors] + N = [len(f.categories) for f in factors] labels = [f.codes for f in factors] # compute the key diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 4f72c0d1c6cbe..047197124ee85 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -4541,7 +4541,7 @@ def test_categorical(self): with ensure_clean_store(self.path) as store: - s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], levels=['a','b','c','d'])) + s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=['a','b','c','d'])) self.assertRaises(NotImplementedError, store.put, 's_fixed', s, format='fixed') self.assertRaises(NotImplementedError, store.append, 's_table', s, format='table') diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 403f2e9329f95..edf18edd6a20d 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -21,6 +21,16 @@ def setUp(self): self.factor = Categorical.from_array(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) + def assert_categorical_equal(self, res, exp): + if not com.array_equivalent(res.categories, exp.categories): + raise AssertionError('categories not equivalent: {0} vs {1}.'.format(res.categories, + exp.categories)) + if not com.array_equivalent(res.codes, exp.codes): + raise AssertionError('codes not equivalent: {0} vs {1}.'.format(res.codes, + exp.codes)) + self.assertEqual(res.ordered, exp.ordered, "ordered not the same") + self.assertEqual(res.name, exp.name, "name not the same") + def test_getitem(self): self.assertEqual(self.factor[0], 'a') self.assertEqual(self.factor[-1], 'c') @@ -39,39 +49,16 @@ def test_constructor_unsortable(self): self.assertFalse(factor.ordered) def test_constructor(self): - # There are multiple ways to call a constructor - # old style: two arrays, one a pointer to the labels - # old style is now only available with compat=True exp_arr = np.array(["a", "b", "c", "a", "b", "c"]) - with tm.assert_produces_warning(FutureWarning): - c_old = Categorical([0,1,2,0,1,2], levels=["a","b","c"], compat=True) - self.assert_numpy_array_equal(c_old.__array__(), exp_arr) - # the next one are from the old docs - with tm.assert_produces_warning(FutureWarning): - c_old2 = Categorical([0, 1, 2, 0, 1, 2], [1, 2, 3], compat=True) - self.assert_numpy_array_equal(c_old2.__array__(), np.array([1, 2, 3, 1, 2, 3])) - with tm.assert_produces_warning(FutureWarning): - c_old3 = Categorical([0,1,2,0,1,2], ['a', 'b', 'c'], compat=True) - self.assert_numpy_array_equal(c_old3.__array__(), np.array(['a', 'b', 'c', 'a', 'b', 'c'])) - - with 
tm.assert_produces_warning(FutureWarning): - cat = pd.Categorical([1,2], levels=[1,2,3], compat=True) - self.assert_numpy_array_equal(cat.__array__(), np.array([2,3])) - - with tm.assert_produces_warning(None): - cat = pd.Categorical([1,2], levels=[1,2,3], compat=False) - self.assert_numpy_array_equal(cat.__array__(), np.array([1,2])) - - # new style c1 = Categorical(exp_arr) self.assert_numpy_array_equal(c1.__array__(), exp_arr) - c2 = Categorical(exp_arr, levels=["a","b","c"]) + c2 = Categorical(exp_arr, categories=["a","b","c"]) self.assert_numpy_array_equal(c2.__array__(), exp_arr) - c2 = Categorical(exp_arr, levels=["c","b","a"]) + c2 = Categorical(exp_arr, categories=["c","b","a"]) self.assert_numpy_array_equal(c2.__array__(), exp_arr) - # levels must be unique + # categories must be unique def f(): Categorical([1,2], [1,2,2]) self.assertRaises(ValueError, f) @@ -88,25 +75,25 @@ def f(): c2 = Categorical(c1) self.assertTrue(c1.equals(c2)) - c1 = Categorical(["a", "b", "c", "a"], levels=["a","b","c","d"]) + c1 = Categorical(["a", "b", "c", "a"], categories=["a","b","c","d"]) c2 = Categorical(c1) self.assertTrue(c1.equals(c2)) - c1 = Categorical(["a", "b", "c", "a"], levels=["a","c","b"]) + c1 = Categorical(["a", "b", "c", "a"], categories=["a","c","b"]) c2 = Categorical(c1) self.assertTrue(c1.equals(c2)) - c1 = Categorical(["a", "b", "c", "a"], levels=["a","c","b"]) - c2 = Categorical(c1, levels=["a","b","c"]) + c1 = Categorical(["a", "b", "c", "a"], categories=["a","c","b"]) + c2 = Categorical(c1, categories=["a","b","c"]) self.assert_numpy_array_equal(c1.__array__(), c2.__array__()) - self.assert_numpy_array_equal(c2.levels, np.array(["a","b","c"])) + self.assert_numpy_array_equal(c2.categories, np.array(["a","b","c"])) # Series of dtype category - c1 = Categorical(["a", "b", "c", "a"], levels=["a","b","c","d"]) + c1 = Categorical(["a", "b", "c", "a"], categories=["a","b","c","d"]) c2 = Categorical(Series(c1)) self.assertTrue(c1.equals(c2)) - c1 = Categorical(["a", "b", "c", "a"], levels=["a","c","b"]) + c1 = Categorical(["a", "b", "c", "a"], categories=["a","c","b"]) c2 = Categorical(Series(c1)) self.assertTrue(c1.equals(c2)) @@ -115,65 +102,80 @@ def f(): c2 = Categorical(Series(["a", "b", "c", "a"])) self.assertTrue(c1.equals(c2)) - c1 = Categorical(["a", "b", "c", "a"], levels=["a","b","c","d"]) - c2 = Categorical(Series(["a", "b", "c", "a"]), levels=["a","b","c","d"]) + c1 = Categorical(["a", "b", "c", "a"], categories=["a","b","c","d"]) + c2 = Categorical(Series(["a", "b", "c", "a"]), categories=["a","b","c","d"]) self.assertTrue(c1.equals(c2)) - # This should result in integer levels, not float! - cat = pd.Categorical([1,2,3,np.nan], levels=[1,2,3]) - self.assertTrue(com.is_integer_dtype(cat.levels)) + # This should result in integer categories, not float! + cat = pd.Categorical([1,2,3,np.nan], categories=[1,2,3]) + self.assertTrue(com.is_integer_dtype(cat.categories)) # https://github.com/pydata/pandas/issues/3678 cat = pd.Categorical([np.nan,1, 2, 3]) - self.assertTrue(com.is_integer_dtype(cat.levels)) + self.assertTrue(com.is_integer_dtype(cat.categories)) # this should result in floats cat = pd.Categorical([np.nan, 1, 2., 3 ]) - self.assertTrue(com.is_float_dtype(cat.levels)) + self.assertTrue(com.is_float_dtype(cat.categories)) cat = pd.Categorical([np.nan, 1., 2., 3. 
]) - self.assertTrue(com.is_float_dtype(cat.levels)) + self.assertTrue(com.is_float_dtype(cat.categories)) - # preserve int as far as possible by converting to object if NaN is in levels - cat = pd.Categorical([np.nan, 1, 2, 3], levels=[np.nan, 1, 2, 3]) - self.assertTrue(com.is_object_dtype(cat.levels)) + # preserve int as far as possible by converting to object if NaN is in categories + cat = pd.Categorical([np.nan, 1, 2, 3], categories=[np.nan, 1, 2, 3]) + self.assertTrue(com.is_object_dtype(cat.categories)) # This doesn't work -> this would probably need some kind of "remember the original type" # feature to try to cast the array interface result to... #vals = np.asarray(cat[cat.notnull()]) #self.assertTrue(com.is_integer_dtype(vals)) - cat = pd.Categorical([np.nan,"a", "b", "c"], levels=[np.nan,"a", "b", "c"]) - self.assertTrue(com.is_object_dtype(cat.levels)) + cat = pd.Categorical([np.nan,"a", "b", "c"], categories=[np.nan,"a", "b", "c"]) + self.assertTrue(com.is_object_dtype(cat.categories)) # but don't do it for floats - cat = pd.Categorical([np.nan, 1., 2., 3.], levels=[np.nan, 1., 2., 3.]) - self.assertTrue(com.is_float_dtype(cat.levels)) + cat = pd.Categorical([np.nan, 1., 2., 3.], categories=[np.nan, 1., 2., 3.]) + self.assertTrue(com.is_float_dtype(cat.categories)) # corner cases cat = pd.Categorical([1]) - self.assertTrue(len(cat.levels) == 1) - self.assertTrue(cat.levels[0] == 1) + self.assertTrue(len(cat.categories) == 1) + self.assertTrue(cat.categories[0] == 1) self.assertTrue(len(cat.codes) == 1) self.assertTrue(cat.codes[0] == 0) cat = pd.Categorical(["a"]) - self.assertTrue(len(cat.levels) == 1) - self.assertTrue(cat.levels[0] == "a") + self.assertTrue(len(cat.categories) == 1) + self.assertTrue(cat.categories[0] == "a") self.assertTrue(len(cat.codes) == 1) self.assertTrue(cat.codes[0] == 0) # Scalars should be converted to lists cat = pd.Categorical(1) - self.assertTrue(len(cat.levels) == 1) - self.assertTrue(cat.levels[0] == 1) + self.assertTrue(len(cat.categories) == 1) + self.assertTrue(cat.categories[0] == 1) self.assertTrue(len(cat.codes) == 1) self.assertTrue(cat.codes[0] == 0) - cat = pd.Categorical([1], levels=1) - self.assertTrue(len(cat.levels) == 1) - self.assertTrue(cat.levels[0] == 1) + cat = pd.Categorical([1], categories=1) + self.assertTrue(len(cat.categories) == 1) + self.assertTrue(cat.categories[0] == 1) self.assertTrue(len(cat.codes) == 1) self.assertTrue(cat.codes[0] == 0) + # Catch old style constructor useage: two arrays, codes + categories + # We can only catch two cases: + # - when the first is an integer dtype and the second is not + # - when the resulting codes are all -1/NaN + with tm.assert_produces_warning(RuntimeWarning): + c_old = Categorical([0,1,2,0,1,2], categories=["a","b","c"]) + + with tm.assert_produces_warning(RuntimeWarning): + c_old = Categorical([0,1,2,0,1,2], categories=[3,4,5]) + + # the next one are from the old docs, but unfortunately these don't trigger :-( + with tm.assert_produces_warning(None): + c_old2 = Categorical([0, 1, 2, 0, 1, 2], [1, 2, 3]) + cat = Categorical([1,2], categories=[1,2,3]) + def test_constructor_with_generator(self): # This was raising an Error in isnull(single_val).any() because isnull returned a scalar # for a generator @@ -189,16 +191,16 @@ def test_constructor_with_generator(self): from pandas.core.index import MultiIndex MultiIndex.from_product([range(5), ['a', 'b', 'c']]) - # check that levels accept generators and sequences - cat = pd.Categorical([0,1,2], levels=(x for x in [0,1,2])) + # 
check that categories accept generators and sequences + cat = pd.Categorical([0,1,2], categories=(x for x in [0,1,2])) self.assertTrue(cat.equals(exp)) - cat = pd.Categorical([0,1,2], levels=xrange(3)) + cat = pd.Categorical([0,1,2], categories=xrange(3)) self.assertTrue(cat.equals(exp)) def test_from_codes(self): - # too few levels + # too few categories def f(): Categorical.from_codes([1,2], [1,2]) self.assertRaises(ValueError, f) @@ -208,7 +210,7 @@ def f(): Categorical.from_codes(["a"], [1,2]) self.assertRaises(ValueError, f) - # no unique levels + # no unique categories def f(): Categorical.from_codes([0,1,2], ["a","a","b"]) self.assertRaises(ValueError, f) @@ -226,7 +228,7 @@ def f(): # Not available in earlier numpy versions if hasattr(np.random, "choice"): codes = np.random.choice([0,1], 5, p=[0.9,0.1]) - pd.Categorical.from_codes(codes, levels=["train", "test"]) + pd.Categorical.from_codes(codes, categories=["train", "test"]) def test_comparisons(self): result = self.factor[self.factor == 'a'] @@ -265,12 +267,12 @@ def test_comparisons(self): self.assert_numpy_array_equal(result, expected) # comparisons with categoricals - cat_rev = pd.Categorical(["a","b","c"], levels=["c","b","a"]) - cat_rev_base = pd.Categorical(["b","b","b"], levels=["c","b","a"]) + cat_rev = pd.Categorical(["a","b","c"], categories=["c","b","a"]) + cat_rev_base = pd.Categorical(["b","b","b"], categories=["c","b","a"]) cat = pd.Categorical(["a","b","c"]) - cat_base = pd.Categorical(["b","b","b"], levels=cat.levels) + cat_base = pd.Categorical(["b","b","b"], categories=cat.categories) - # comparisons need to take level ordering into account + # comparisons need to take categories ordering into account res_rev = cat_rev > cat_rev_base exp_rev = np.array([True, False, False]) self.assert_numpy_array_equal(res_rev, exp_rev) @@ -283,12 +285,12 @@ def test_comparisons(self): exp = np.array([False, False, True]) self.assert_numpy_array_equal(res, exp) - # Only categories with same levels can be compared + # Only categories with same categories can be compared def f(): cat > cat_rev self.assertRaises(TypeError, f) - cat_rev_base2 = pd.Categorical(["b","b","b"], levels=["c","b","a","d"]) + cat_rev_base2 = pd.Categorical(["b","b","b"], categories=["c","b","a","d"]) def f(): cat_rev > cat_rev_base2 self.assertRaises(TypeError, f) @@ -320,19 +322,19 @@ def f(): self.assertRaises(TypeError, lambda: a < cat) self.assertRaises(TypeError, lambda: a < cat_rev) - def test_na_flags_int_levels(self): + def test_na_flags_int_categories(self): # #1457 - levels = lrange(10) + categories = lrange(10) labels = np.random.randint(0, 10, 20) labels[::5] = -1 - cat = Categorical(labels, levels, fastpath=True) + cat = Categorical(labels, categories, fastpath=True) repr(cat) self.assert_numpy_array_equal(com.isnull(cat), labels == -1) - def test_levels_none(self): + def test_categories_none(self): factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) self.assertTrue(factor.equals(self.factor)) @@ -342,27 +344,27 @@ def test_describe(self): desc = self.factor.describe() expected = DataFrame.from_dict(dict(counts=[3, 2, 3], freqs=[3/8., 2/8., 3/8.], - levels=['a', 'b', 'c']) - ).set_index('levels') + categories=['a', 'b', 'c']) + ).set_index('categories') tm.assert_frame_equal(desc, expected) - # check unused levels + # check unused categories cat = self.factor.copy() - cat.levels = ["a","b","c","d"] + cat.set_categories(["a","b","c","d"], inplace=True) desc = cat.describe() expected = DataFrame.from_dict(dict(counts=[3, 2, 3, 
np.nan], freqs=[3/8., 2/8., 3/8., np.nan], - levels=['a', 'b', 'c', 'd']) - ).set_index('levels') + categories=['a', 'b', 'c', 'd']) + ).set_index('categories') tm.assert_frame_equal(desc, expected) # check an integer one desc = Categorical([1,2,3,1,2,3,3,2,1,1,1]).describe() expected = DataFrame.from_dict(dict(counts=[5, 3, 3], freqs=[5/11., 3/11., 3/11.], - levels=[1,2,3] + categories=[1,2,3] ) - ).set_index('levels') + ).set_index('categories') tm.assert_frame_equal(desc, expected) # https://github.com/pydata/pandas/issues/3678 @@ -371,77 +373,75 @@ def test_describe(self): desc = cat.describe() expected = DataFrame.from_dict(dict(counts=[1, 2, 1], freqs=[1/4., 2/4., 1/4.], - levels=[1,2,np.nan] + categories=[1,2,np.nan] ) - ).set_index('levels') + ).set_index('categories') tm.assert_frame_equal(desc, expected) - # having NaN as level and as "not available" should also print two NaNs in describe! + # having NaN as category and as "not available" should also print two NaNs in describe! cat = pd.Categorical([np.nan,1, 2, 2]) - cat.levels = [1,2,np.nan] + cat.set_categories([1,2,np.nan], rename=True, inplace=True) desc = cat.describe() expected = DataFrame.from_dict(dict(counts=[1, 2, np.nan, 1], freqs=[1/4., 2/4., np.nan, 1/4.], - levels=[1,2,np.nan,np.nan] + categories=[1,2,np.nan,np.nan] ) - ).set_index('levels') + ).set_index('categories') tm.assert_frame_equal(desc, expected) - # empty levels show up as NA - cat = Categorical(["a","b","b","b"], levels=['a','b','c'], ordered=True) + # empty categories show up as NA + cat = Categorical(["a","b","b","b"], categories=['a','b','c'], ordered=True) result = cat.describe() expected = DataFrame([[1,0.25],[3,0.75],[np.nan,np.nan]], columns=['counts','freqs'], - index=Index(['a','b','c'],name='levels')) + index=Index(['a','b','c'],name='categories')) tm.assert_frame_equal(result,expected) - # NA as a level - cat = pd.Categorical(["a","c","c",np.nan], levels=["b","a","c",np.nan] ) + # NA as a category + cat = pd.Categorical(["a","c","c",np.nan], categories=["b","a","c",np.nan] ) result = cat.describe() expected = DataFrame([[np.nan, np.nan],[1,0.25],[2,0.5], [1,0.25]], columns=['counts','freqs'], - index=Index(['b','a','c',np.nan],name='levels')) + index=Index(['b','a','c',np.nan],name='categories')) tm.assert_frame_equal(result,expected) def test_print(self): - expected = [" a", " b", " b", " a", " a", " c", " c", " c", - "Levels (3, object): [a < b < c]"] + expected = ["[a, b, b, a, a, c, c, c]", + "Categories (3, object): [a < b < c]"] expected = "\n".join(expected) actual = repr(self.factor) self.assertEqual(actual, expected) def test_big_print(self): factor = Categorical([0,1,2,0,1,2]*100, ['a', 'b', 'c'], name='cat', fastpath=True) - expected = [" a", " b", " c", " a", " b", " c", " a", " b", " c", - " a", " b", " c", " a", "...", " c", " a", " b", " c", - " a", " b", " c", " a", " b", " c", " a", " b", " c", + expected = ["[a, b, c, a, b, ..., b, c, a, b, c]", "Name: cat, Length: 600", - "Levels (3, object): [a, b, c]"] + "Categories (3, object): [a, b, c]"] expected = "\n".join(expected) actual = repr(factor) - self.assertEqual(expected, actual) + self.assertEqual(actual, expected) def test_empty_print(self): factor = Categorical([], ["a","b","c"], name="cat") - expected = ("Categorical([], Name: cat, Levels (3, object): [a < b < c]") + expected = ("[], Name: cat, Categories (3, object): [a < b < c]") # hack because array_repr changed in numpy > 1.6.x actual = repr(factor) self.assertEqual(actual, expected) factor = 
Categorical([], ["a","b","c"]) - expected = ("Categorical([], Levels (3, object): [a < b < c]") + expected = ("[], Categories (3, object): [a < b < c]") actual = repr(factor) self.assertEqual(expected, actual) factor = Categorical([], []) - expected = ("Categorical([], Levels (0, object): []") + expected = ("[], Categories (0, object): []") self.assertEqual(expected, repr(factor)) def test_periodindex(self): @@ -453,7 +453,7 @@ def test_periodindex(self): exp_arr = np.array([0, 0, 1, 1, 2, 2],dtype='int64') exp_idx = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M') self.assert_numpy_array_equal(cat1._codes, exp_arr) - self.assertTrue(cat1.levels.equals(exp_idx)) + self.assertTrue(cat1.categories.equals(exp_idx)) idx2 = PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01', '2014-03', '2014-01'], freq='M') @@ -462,7 +462,7 @@ def test_periodindex(self): exp_arr = np.array([2, 2, 1, 0, 2, 0],dtype='int64') exp_idx2 = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M') self.assert_numpy_array_equal(cat2._codes, exp_arr) - self.assertTrue(cat2.levels.equals(exp_idx2)) + self.assertTrue(cat2.categories.equals(exp_idx2)) idx3 = PeriodIndex(['2013-12', '2013-11', '2013-10', '2013-09', '2013-08', '2013-07', '2013-05'], freq='M') @@ -471,102 +471,241 @@ def test_periodindex(self): exp_idx = PeriodIndex(['2013-05', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12'], freq='M') self.assert_numpy_array_equal(cat3._codes, exp_arr) - self.assertTrue(cat3.levels.equals(exp_idx)) + self.assertTrue(cat3.categories.equals(exp_idx)) - def test_level_assigments(self): + def test_categories_assigments(self): s = pd.Categorical(["a","b","c","a"]) exp = np.array([1,2,3,1]) - s.levels = [1,2,3] + s.categories = [1,2,3] self.assert_numpy_array_equal(s.__array__(), exp) - self.assert_numpy_array_equal(s.levels, np.array([1,2,3])) + self.assert_numpy_array_equal(s.categories, np.array([1,2,3])) # lengthen - s.levels = [1,2,3,4] - # does nothing to the values but only the the levels - self.assert_numpy_array_equal(s.__array__(), exp) - self.assert_numpy_array_equal(s.levels, np.array([1,2,3,4])) + def f(): + s.categories = [1,2,3,4] + self.assertRaises(ValueError, f) # shorten - exp2 = np.array([1,2,np.nan,1]) - s.levels = [1,2] - self.assert_numpy_array_equivalent(s.__array__(), exp2) # doesn't work with nan :-( - self.assertTrue(np.isnan(s.__array__()[2])) - self.assert_numpy_array_equal(s.levels, np.array([1,2])) + def f(): + s.categories = [1,2] + self.assertRaises(ValueError, f) - def test_reorder_levels(self): + def test_set_categories(self): cat = Categorical(["a","b","c","a"], ordered=True) - exp_levels = np.array(["c","b","a"]) + exp_categories = np.array(["c","b","a"]) exp_values = np.array(["a","b","c","a"]) - cat.reorder_levels(["c","b","a"]) - self.assert_numpy_array_equal(cat.levels, exp_levels) + + res = cat.set_categories(["c","b","a"], inplace=True) + self.assert_numpy_array_equal(cat.categories, exp_categories) self.assert_numpy_array_equal(cat.__array__(), exp_values) + self.assertIsNone(res) + + res = cat.set_categories(["a","b","c"]) + # cat must be the same as before + self.assert_numpy_array_equal(cat.categories, exp_categories) + self.assert_numpy_array_equal(cat.__array__(), exp_values) + # only res is changed + exp_categories_back = np.array(["a","b","c"]) + self.assert_numpy_array_equal(res.categories, exp_categories_back) + self.assert_numpy_array_equal(res.__array__(), exp_values) + + # not all "old" included in "new" -> all not included ones are now np.nan + 
cat = Categorical(["a","b","c","a"], ordered=True) + res = cat.set_categories(["a"]) + self.assert_numpy_array_equal(res.codes, np.array([0,-1,-1,0])) + + # still not all "old" in "new" + res = cat.set_categories(["a","b","d"]) + self.assert_numpy_array_equal(res.codes, np.array([0,1,-1,0])) + self.assert_numpy_array_equal(res.categories, np.array(["a","b","d"])) + + # all "old" included in "new" + cat = cat.set_categories(["a","b","c","d"]) + exp_categories = np.array(["a","b","c","d"]) + self.assert_numpy_array_equal(cat.categories, exp_categories) + + # internals... + c = Categorical([1,2,3,4,1], categories=[1,2,3,4]) + self.assert_numpy_array_equal(c._codes, np.array([0,1,2,3,0])) + self.assert_numpy_array_equal(c.categories , np.array([1,2,3,4] )) + self.assert_numpy_array_equal(c.get_values(), np.array([1,2,3,4,1] )) + c = c.set_categories([4,3,2,1]) # all "pointers" to '4' must be changed from 3 to 0,... + self.assert_numpy_array_equal(c._codes, np.array([3,2,1,0,3])) # positions are changed + self.assert_numpy_array_equal(c.categories, np.array([4,3,2,1])) # categories are now in new order + self.assert_numpy_array_equal(c.get_values(), np.array([1,2,3,4,1])) # output is the same + self.assertTrue(c.min(), 4) + self.assertTrue(c.max(), 1) + + def test_rename_categories(self): + cat = pd.Categorical(["a","b","c","a"]) + + # inplace=False: the old one must not be changed + res = cat.rename_categories([1,2,3]) + self.assert_numpy_array_equal(res.__array__(), np.array([1,2,3,1])) + self.assert_numpy_array_equal(res.categories, np.array([1,2,3])) + self.assert_numpy_array_equal(cat.__array__(), np.array(["a","b","c","a"])) + self.assert_numpy_array_equal(cat.categories, np.array(["a","b","c"])) + res = cat.rename_categories([1,2,3], inplace=True) + + # and now inplace + self.assertIsNone(res) + self.assert_numpy_array_equal(cat.__array__(), np.array([1,2,3,1])) + self.assert_numpy_array_equal(cat.categories, np.array([1,2,3])) + + # lengthen + def f(): + cat.rename_categories([1,2,3,4]) + self.assertRaises(ValueError, f) + # shorten + def f(): + cat.rename_categories([1,2]) + self.assertRaises(ValueError, f) + + def test_reorder_categories(self): + cat = Categorical(["a","b","c","a"], ordered=True) + old = cat.copy() + new = Categorical(["a","b","c","a"], categories=["c","b","a"], ordered=True) + + # first inplace == False + res = cat.reorder_categories(["c","b","a"]) + # cat must be the same as before + self.assert_categorical_equal(cat, old) + # only res is changed + self.assert_categorical_equal(res, new) + + # inplace == True + res = cat.reorder_categories(["c","b","a"], inplace=True) + self.assertIsNone(res) + self.assert_categorical_equal(cat, new) # not all "old" included in "new" + cat = Categorical(["a","b","c","a"], ordered=True) def f(): - cat.reorder_levels(["a"]) + cat.reorder_categories(["a"]) self.assertRaises(ValueError, f) # still not all "old" in "new" def f(): - cat.reorder_levels(["a","b","d"]) + cat.reorder_categories(["a","b","d"]) self.assertRaises(ValueError, f) - # This works: all "old" included in "new" - cat.reorder_levels(["a","b","c","d"]) - exp_levels = np.array(["a","b","c","d"]) - self.assert_numpy_array_equal(cat.levels, exp_levels) + # all "old" included in "new", but too long + def f(): + cat.reorder_categories(["a","b","c","d"]) + self.assertRaises(ValueError, f) - # internals... 
- c = Categorical([1,2,3,4,1], levels=[1,2,3,4]) - self.assert_numpy_array_equal(c._codes, np.array([0,1,2,3,0])) - self.assert_numpy_array_equal(c.levels , np.array([1,2,3,4] )) - self.assert_numpy_array_equal(c.get_values() , np.array([1,2,3,4,1] )) - c.reorder_levels([4,3,2,1]) # all "pointers" to '4' must be changed from 3 to 0,... - self.assert_numpy_array_equal(c._codes , np.array([3,2,1,0,3])) # positions are changed - self.assert_numpy_array_equal(c.levels , np.array([4,3,2,1])) # levels are now in new order - self.assert_numpy_array_equal(c.get_values() , np.array([1,2,3,4,1])) # output is the same - self.assertTrue(c.min(), 4) - self.assertTrue(c.max(), 1) + def test_add_categories(self): + cat = Categorical(["a","b","c","a"], ordered=True) + old = cat.copy() + new = Categorical(["a","b","c","a"], categories=["a","b","c","d"], ordered=True) + + # first inplace == False + res = cat.add_categories("d") + self.assert_categorical_equal(cat, old) + self.assert_categorical_equal(res, new) + + res = cat.add_categories(["d"]) + self.assert_categorical_equal(cat, old) + self.assert_categorical_equal(res, new) + + # inplace == True + res = cat.add_categories("d", inplace=True) + self.assert_categorical_equal(cat, new) + self.assertIsNone(res) + # new is in old categories def f(): - c.reorder_levels([4,3,2,10]) + cat.add_categories(["d"]) self.assertRaises(ValueError, f) - def test_remove_unused_levels(self): - c = Categorical(["a","b","c","d","a"], levels=["a","b","c","d","e"]) - self.assert_numpy_array_equal(c.levels , np.array(["a","b","c","d","e"])) - c.remove_unused_levels() - self.assert_numpy_array_equal(c.levels , np.array(["a","b","c","d"])) + def test_remove_categories(self): + cat = Categorical(["a","b","c","a"], ordered=True) + old = cat.copy() + new = Categorical(["a","b",np.nan,"a"], categories=["a","b"], ordered=True) + + # first inplace == False + res = cat.remove_categories("c") + self.assert_categorical_equal(cat, old) + self.assert_categorical_equal(res, new) + + res = cat.remove_categories(["c"]) + self.assert_categorical_equal(cat, old) + self.assert_categorical_equal(res, new) + + # inplace == True + res = cat.remove_categories("c", inplace=True) + self.assert_categorical_equal(cat, new) + self.assertIsNone(res) + + # removal is not in categories + def f(): + cat.remove_categories(["c"]) + self.assertRaises(ValueError, f) + + def test_remove_unused_categories(self): + c = Categorical(["a","b","c","d","a"], categories=["a","b","c","d","e"]) + exp_categories_all = np.array(["a","b","c","d","e"]) + exp_categories_dropped = np.array(["a","b","c","d"]) + + self.assert_numpy_array_equal(c.categories, exp_categories_all) + + res = c.remove_unused_categories() + self.assert_numpy_array_equal(res.categories, exp_categories_dropped) + self.assert_numpy_array_equal(c.categories, exp_categories_all) + + res = c.remove_unused_categories(inplace=True) + self.assert_numpy_array_equal(c.categories, exp_categories_dropped) + self.assertIsNone(res) + def test_nan_handling(self): # Nans are represented as -1 in codes c = Categorical(["a","b",np.nan,"a"]) - self.assert_numpy_array_equal(c.levels , np.array(["a","b"])) + self.assert_numpy_array_equal(c.categories , np.array(["a","b"])) self.assert_numpy_array_equal(c._codes , np.array([0,1,-1,0])) + c[1] = np.nan + self.assert_numpy_array_equal(c.categories , np.array(["a","b"])) + self.assert_numpy_array_equal(c._codes , np.array([0,-1,-1,0])) - # If levels have nan included, the code should point to that instead - c = 
Categorical(["a","b",np.nan,"a"], levels=["a","b",np.nan]) - self.assert_numpy_array_equal(c.levels , np.array(["a","b",np.nan],dtype=np.object_)) + # If categories have nan included, the code should point to that instead + c = Categorical(["a","b",np.nan,"a"], categories=["a","b",np.nan]) + self.assert_numpy_array_equal(c.categories , np.array(["a","b",np.nan],dtype=np.object_)) self.assert_numpy_array_equal(c._codes , np.array([0,1,2,0])) + c[1] = np.nan + self.assert_numpy_array_equal(c.categories , np.array(["a","b",np.nan],dtype=np.object_)) + self.assert_numpy_array_equal(c._codes , np.array([0,2,2,0])) - # Changing levels should also make the replaced level np.nan + # Changing categories should also make the replaced category np.nan c = Categorical(["a","b","c","a"]) - c.levels = ["a","b",np.nan] - self.assert_numpy_array_equal(c.levels , np.array(["a","b",np.nan],dtype=np.object_)) + c.categories = ["a","b",np.nan] + self.assert_numpy_array_equal(c.categories , np.array(["a","b",np.nan],dtype=np.object_)) self.assert_numpy_array_equal(c._codes , np.array([0,1,2,0])) + # Adding nan to categories should make assigned nan point to the category! + c = Categorical(["a","b",np.nan,"a"]) + self.assert_numpy_array_equal(c.categories , np.array(["a","b"])) + self.assert_numpy_array_equal(c._codes , np.array([0,1,-1,0])) + c.set_categories(["a","b",np.nan], rename=True, inplace=True) + self.assert_numpy_array_equal(c.categories , np.array(["a","b",np.nan],dtype=np.object_)) + self.assert_numpy_array_equal(c._codes , np.array([0,1,-1,0])) + c[1] = np.nan + self.assert_numpy_array_equal(c.categories , np.array(["a","b",np.nan],dtype=np.object_)) + self.assert_numpy_array_equal(c._codes , np.array([0,2,-1,0])) + + def test_isnull(self): exp = np.array([False, False, True]) c = Categorical(["a","b",np.nan]) res = c.isnull() self.assert_numpy_array_equal(res, exp) - c = Categorical(["a","b",np.nan], levels=["a","b",np.nan]) + c = Categorical(["a","b",np.nan], categories=["a","b",np.nan]) res = c.isnull() self.assert_numpy_array_equal(res, exp) + # test both nan in categories and as -1 exp = np.array([True, False, True]) c = Categorical(["a","b",np.nan]) - c.levels = ["a","b",np.nan] + c.set_categories(["a","b",np.nan], rename=True, inplace=True) c[0] = np.nan res = c.isnull() self.assert_numpy_array_equal(res, exp) @@ -610,12 +749,12 @@ def test_min_max(self): _max = cat.max() self.assertEqual(_min, "a") self.assertEqual(_max, "d") - cat = Categorical(["a","b","c","d"], levels=['d','c','b','a'], ordered=True) + cat = Categorical(["a","b","c","d"], categories=['d','c','b','a'], ordered=True) _min = cat.min() _max = cat.max() self.assertEqual(_min, "d") self.assertEqual(_max, "a") - cat = Categorical([np.nan,"b","c",np.nan], levels=['d','c','b','a'], ordered=True) + cat = Categorical([np.nan,"b","c",np.nan], categories=['d','c','b','a'], ordered=True) _min = cat.min() _max = cat.max() self.assertTrue(np.isnan(_min)) @@ -626,7 +765,7 @@ def test_min_max(self): _max = cat.max(numeric_only=True) self.assertEqual(_max, "b") - cat = Categorical([np.nan,1,2,np.nan], levels=[5,4,3,2,1], ordered=True) + cat = Categorical([np.nan,1,2,np.nan], categories=[5,4,3,2,1], ordered=True) _min = cat.min() _max = cat.max() self.assertTrue(np.isnan(_min)) @@ -637,32 +776,38 @@ def test_min_max(self): _max = cat.max(numeric_only=True) self.assertEqual(_max, 1) + def test_unique(self): + cat = Categorical(["a","b","c","d"]) + exp = np.asarray(["a","b","c","d"]) + res = cat.unique() + self.assert_numpy_array_equal(res, 
exp) + self.assertEqual(type(res), type(exp)) def test_mode(self): - s = Categorical([1,1,2,4,5,5,5], levels=[5,4,3,2,1], ordered=True) + s = Categorical([1,1,2,4,5,5,5], categories=[5,4,3,2,1], ordered=True) res = s.mode() - exp = Categorical([5], levels=[5,4,3,2,1], ordered=True) + exp = Categorical([5], categories=[5,4,3,2,1], ordered=True) self.assertTrue(res.equals(exp)) - s = Categorical([1,1,1,4,5,5,5], levels=[5,4,3,2,1], ordered=True) + s = Categorical([1,1,1,4,5,5,5], categories=[5,4,3,2,1], ordered=True) res = s.mode() - exp = Categorical([5,1], levels=[5,4,3,2,1], ordered=True) + exp = Categorical([5,1], categories=[5,4,3,2,1], ordered=True) self.assertTrue(res.equals(exp)) - s = Categorical([1,2,3,4,5], levels=[5,4,3,2,1], ordered=True) + s = Categorical([1,2,3,4,5], categories=[5,4,3,2,1], ordered=True) res = s.mode() - exp = Categorical([], levels=[5,4,3,2,1], ordered=True) + exp = Categorical([], categories=[5,4,3,2,1], ordered=True) self.assertTrue(res.equals(exp)) # NaN should not become the mode! - s = Categorical([np.nan,np.nan,np.nan,4,5], levels=[5,4,3,2,1], ordered=True) + s = Categorical([np.nan,np.nan,np.nan,4,5], categories=[5,4,3,2,1], ordered=True) res = s.mode() - exp = Categorical([], levels=[5,4,3,2,1], ordered=True) + exp = Categorical([], categories=[5,4,3,2,1], ordered=True) self.assertTrue(res.equals(exp)) - s = Categorical([np.nan,np.nan,np.nan,4,5,4], levels=[5,4,3,2,1], ordered=True) + s = Categorical([np.nan,np.nan,np.nan,4,5,4], categories=[5,4,3,2,1], ordered=True) res = s.mode() - exp = Categorical([4], levels=[5,4,3,2,1], ordered=True) + exp = Categorical([4], categories=[5,4,3,2,1], ordered=True) self.assertTrue(res.equals(exp)) - s = Categorical([np.nan,np.nan,4,5,4], levels=[5,4,3,2,1], ordered=True) + s = Categorical([np.nan,np.nan,4,5,4], categories=[5,4,3,2,1], ordered=True) res = s.mode() - exp = Categorical([4], levels=[5,4,3,2,1], ordered=True) + exp = Categorical([4], categories=[5,4,3,2,1], ordered=True) self.assertTrue(res.equals(exp)) @@ -678,7 +823,7 @@ def test_sort(self): exp = np.array(["a","b","c","d"],dtype=object) self.assert_numpy_array_equal(res.__array__(), exp) - cat = Categorical(["a","c","b","d"], levels=["a","b","c","d"], ordered=True) + cat = Categorical(["a","c","b","d"], categories=["a","b","c","d"], ordered=True) res = cat.order() exp = np.array(["a","b","c","d"],dtype=object) self.assert_numpy_array_equal(res.__array__(), exp) @@ -698,56 +843,68 @@ def test_slicing_directly(self): sliced = cat[3] tm.assert_equal(sliced, "d") sliced = cat[3:5] - expected = Categorical(["d","a"], levels=['a', 'b', 'c', 'd']) + expected = Categorical(["d","a"], categories=['a', 'b', 'c', 'd']) self.assert_numpy_array_equal(sliced._codes, expected._codes) - tm.assert_index_equal(sliced.levels, expected.levels) + tm.assert_index_equal(sliced.categories, expected.categories) def test_set_item_nan(self): cat = pd.Categorical([1,2,3]) - exp = pd.Categorical([1,np.nan,3], levels=[1,2,3]) + exp = pd.Categorical([1,np.nan,3], categories=[1,2,3]) cat[1] = np.nan self.assertTrue(cat.equals(exp)) - # if nan in levels, the proper code should be set! - cat = pd.Categorical([1,2,3, np.nan], levels=[1,2,3]) - cat.levels = [1,2,3, np.nan] + # if nan in categories, the proper code should be set! 
+ cat = pd.Categorical([1,2,3, np.nan], categories=[1,2,3]) + cat.set_categories([1,2,3, np.nan], rename=True, inplace=True) cat[1] = np.nan exp = np.array([0,3,2,-1]) self.assert_numpy_array_equal(cat.codes, exp) - cat = pd.Categorical([1,2,3, np.nan], levels=[1,2,3]) - cat.levels = [1,2,3, np.nan] + cat = pd.Categorical([1,2,3, np.nan], categories=[1,2,3]) + cat.set_categories([1,2,3, np.nan], rename=True, inplace=True) cat[1:3] = np.nan exp = np.array([0,3,3,-1]) self.assert_numpy_array_equal(cat.codes, exp) - cat = pd.Categorical([1,2,3, np.nan], levels=[1,2,3]) - cat.levels = [1,2,3, np.nan] + cat = pd.Categorical([1,2,3, np.nan], categories=[1,2,3]) + cat.set_categories([1,2,3, np.nan], rename=True, inplace=True) cat[1:3] = [np.nan, 1] exp = np.array([0,3,0,-1]) self.assert_numpy_array_equal(cat.codes, exp) - cat = pd.Categorical([1,2,3, np.nan], levels=[1,2,3]) - cat.levels = [1,2,3, np.nan] + cat = pd.Categorical([1,2,3, np.nan], categories=[1,2,3]) + cat.set_categories([1,2,3, np.nan], rename=True, inplace=True) cat[1:3] = [np.nan, np.nan] exp = np.array([0,3,3,-1]) self.assert_numpy_array_equal(cat.codes, exp) - cat = pd.Categorical([1,2, np.nan, 3], levels=[1,2,3]) - cat.levels = [1,2,3, np.nan] + cat = pd.Categorical([1,2, np.nan, 3], categories=[1,2,3]) + cat.set_categories([1,2,3, np.nan], rename=True, inplace=True) cat[pd.isnull(cat)] = np.nan exp = np.array([0,1,3,2]) self.assert_numpy_array_equal(cat.codes, exp) def test_deprecated_labels(self): - # labels is deprecated and should be removed in 0.18 or 2017, whatever is earlier - cat = pd.Categorical([1,2,3, np.nan], levels=[1,2,3]) + # TODO: labels is deprecated and should be removed in 0.18 or 2017, whatever is earlier + cat = pd.Categorical([1,2,3, np.nan], categories=[1,2,3]) exp = cat.codes with tm.assert_produces_warning(FutureWarning): res = cat.labels self.assert_numpy_array_equal(res, exp) self.assertFalse(LooseVersion(pd.__version__) >= '0.18') + def test_deprecated_levels(self): + # TODO: levels is deprecated and should be removed in 0.18 or 2017, whatever is earlier + cat = pd.Categorical([1,2,3, np.nan], categories=[1,2,3]) + exp = cat.categories + with tm.assert_produces_warning(FutureWarning): + res = cat.levels + self.assert_numpy_array_equal(res, exp) + with tm.assert_produces_warning(FutureWarning): + res = pd.Categorical([1,2,3, np.nan], levels=[1,2,3]) + self.assert_numpy_array_equal(res.categories, exp) + + self.assertFalse(LooseVersion(pd.__version__) >= '0.18') class TestCategoricalAsBlock(tm.TestCase): @@ -880,7 +1037,7 @@ def test_construction_series(self): # insert into frame with different index # GH 8076 index = pd.date_range('20000101', periods=3) - expected = Series(Categorical(values=[np.nan,np.nan,np.nan],levels=['a', 'b', 'c'])) + expected = Series(Categorical(values=[np.nan,np.nan,np.nan],categories=['a', 'b', 'c'])) expected.index = index expected = DataFrame({'x': expected}) @@ -894,17 +1051,17 @@ def test_reindex(self): # reindexing to an invalid Categorical s = Series(['a', 'b', 'c'],dtype='category') result = s.reindex(index) - expected = Series(Categorical(values=[np.nan,np.nan,np.nan],levels=['a', 'b', 'c'])) + expected = Series(Categorical(values=[np.nan,np.nan,np.nan],categories=['a', 'b', 'c'])) expected.index = index tm.assert_series_equal(result, expected) # partial reindexing - expected = Series(Categorical(values=['b','c'],levels=['a', 'b', 'c'])) + expected = Series(Categorical(values=['b','c'],categories=['a', 'b', 'c'])) expected.index = [1,2] result = s.reindex([1,2]) 
tm.assert_series_equal(result, expected) - expected = Series(Categorical(values=['c',np.nan],levels=['a', 'b', 'c'])) + expected = Series(Categorical(values=['c',np.nan],categories=['a', 'b', 'c'])) expected.index = [2,3] result = s.reindex([2,3]) tm.assert_series_equal(result, expected) @@ -918,7 +1075,7 @@ def test_sideeffects_free(self): cat = Categorical(["a","b","c","a"]) s = pd.Series(cat, copy=True) self.assertFalse(s.cat is cat) - s.cat.levels = [1,2,3] + s.cat.categories = [1,2,3] exp_s = np.array([1,2,3,1]) exp_cat = np.array(["a","b","c","a"]) self.assert_numpy_array_equal(s.__array__(), exp_s) @@ -935,7 +1092,7 @@ def test_sideeffects_free(self): cat = Categorical(["a","b","c","a"]) s = pd.Series(cat) self.assertTrue(s.values is cat) - s.cat.levels = [1,2,3] + s.cat.categories = [1,2,3] exp_s = np.array([1,2,3,1]) self.assert_numpy_array_equal(s.__array__(), exp_s) self.assert_numpy_array_equal(cat.__array__(), exp_s) @@ -949,33 +1106,35 @@ def test_nan_handling(self): # Nans are represented as -1 in labels s = Series(Categorical(["a","b",np.nan,"a"])) - self.assert_numpy_array_equal(s.cat.levels, np.array(["a","b"])) + self.assert_numpy_array_equal(s.cat.categories, np.array(["a","b"])) self.assert_numpy_array_equal(s.values.codes, np.array([0,1,-1,0])) - # If levels have nan included, the label should point to that instead - s2 = Series(Categorical(["a","b",np.nan,"a"], levels=["a","b",np.nan])) - self.assert_numpy_array_equal(s2.cat.levels, + # If categories have nan included, the label should point to that instead + s2 = Series(Categorical(["a","b",np.nan,"a"], categories=["a","b",np.nan])) + self.assert_numpy_array_equal(s2.cat.categories, np.array(["a","b",np.nan], dtype=np.object_)) self.assert_numpy_array_equal(s2.values.codes, np.array([0,1,2,0])) - # Changing levels should also make the replaced level np.nan + # Changing categories should also make the replaced category np.nan s3 = Series(Categorical(["a","b","c","a"])) - s3.cat.levels = ["a","b",np.nan] - self.assert_numpy_array_equal(s3.cat.levels, + s3.cat.categories = ["a","b",np.nan] + self.assert_numpy_array_equal(s3.cat.categories, np.array(["a","b",np.nan], dtype=np.object_)) self.assert_numpy_array_equal(s3.values.codes, np.array([0,1,2,0])) def test_cat_accessor(self): s = Series(Categorical(["a","b",np.nan,"a"])) - self.assert_numpy_array_equal(s.cat.levels, np.array(["a","b"])) + self.assert_numpy_array_equal(s.cat.categories, np.array(["a","b"])) self.assertEqual(s.cat.ordered, True) - exp = Categorical(["a","b",np.nan,"a"], levels=["b","a"]) - s.cat.reorder_levels(["b", "a"]) + exp = Categorical(["a","b",np.nan,"a"], categories=["b","a"]) + s.cat.set_categories(["b", "a"], inplace=True) self.assertTrue(s.values.equals(exp)) - exp = Categorical(["a","b",np.nan,"a"], levels=["b","a"]) + res = s.cat.set_categories(["b", "a"]) + self.assertTrue(res.values.equals(exp)) + exp = Categorical(["a","b",np.nan,"a"], categories=["b","a"]) s[:] = "a" - s.cat.remove_unused_levels() - self.assert_numpy_array_equal(s.cat.levels, np.array(["a"])) + s = s.cat.remove_unused_categories() + self.assert_numpy_array_equal(s.cat.categories, np.array(["a"])) def test_sequence_like(self): @@ -1010,41 +1169,45 @@ def test_series_delegations(self): self.assertRaises(TypeError, lambda : Series(np.arange(5.)).cat) self.assertRaises(TypeError, lambda : Series([Timestamp('20130101')]).cat) - # Series should delegate calls to '.level', '.ordered' and '.reorder()' to the categorical + # Series should delegate calls to '.categories', 
'.codes', '.ordered' and the + # methods '.set_categories()' 'drop_unused_categories()' to the categorical s = Series(Categorical(["a","b","c","a"], ordered=True)) - exp_levels = np.array(["a","b","c"]) - self.assert_numpy_array_equal(s.cat.levels, exp_levels) + exp_categories = np.array(["a","b","c"]) + self.assert_numpy_array_equal(s.cat.categories, exp_categories) + s.cat.categories = [1,2,3] + exp_categories = np.array([1,2,3]) + self.assert_numpy_array_equal(s.cat.categories, exp_categories) + + exp_codes = Series(com._ensure_platform_int([0,1,2,0])) + tm.assert_series_equal(s.cat.codes, exp_codes) - s.cat.levels = [1,2,3] - exp_levels = np.array([1,2,3]) - self.assert_numpy_array_equal(s.cat.levels, exp_levels) self.assertEqual(s.cat.ordered, True) s.cat.ordered = False self.assertEqual(s.cat.ordered, False) # reorder s = Series(Categorical(["a","b","c","a"], ordered=True)) - exp_levels = np.array(["c","b","a"]) + exp_categories = np.array(["c","b","a"]) exp_values = np.array(["a","b","c","a"]) - s.cat.reorder_levels(["c","b","a"]) - self.assert_numpy_array_equal(s.cat.levels, exp_levels) + s = s.cat.set_categories(["c","b","a"]) + self.assert_numpy_array_equal(s.cat.categories, exp_categories) self.assert_numpy_array_equal(s.values.__array__(), exp_values) self.assert_numpy_array_equal(s.__array__(), exp_values) - # remove unused levels - s = Series(Categorical(["a","b","b","a"], levels=["a","b","c"])) - exp_levels = np.array(["a","b"]) + # remove unused categories + s = Series(Categorical(["a","b","b","a"], categories=["a","b","c"])) + exp_categories = np.array(["a","b"]) exp_values = np.array(["a","b","b","a"]) - s.cat.remove_unused_levels() - self.assert_numpy_array_equal(s.cat.levels, exp_levels) + s = s.cat.remove_unused_categories() + self.assert_numpy_array_equal(s.cat.categories, exp_categories) self.assert_numpy_array_equal(s.values.__array__(), exp_values) self.assert_numpy_array_equal(s.__array__(), exp_values) # This method is likely to be confused, so test that it raises an error on wrong inputs: def f(): - s.reorder_levels([4,3,2,1]) + s.set_categories([4,3,2,1]) self.assertRaises(Exception, f) - # right: s.cat.reorder_levels([4,3,2,1]) + # right: s.cat.set_categories([4,3,2,1]) def test_series_functions_no_warnings(self): df = pd.DataFrame({'value': np.random.randint(0, 100, 20)}) @@ -1058,8 +1221,8 @@ def test_assignment_to_dataframe(self): labels = [ "{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500) ] df = df.sort(columns=['value'], ascending=True) - d = pd.cut(df.value, range(0, 10500, 500), right=False, labels=labels) - s = Series(d) + s = pd.cut(df.value, range(0, 10500, 500), right=False, labels=labels) + d = s.values df['D'] = d str(df) @@ -1081,9 +1244,9 @@ def test_assignment_to_dataframe(self): # sorting s.name = 'E' - self.assertTrue(result2.sort_index().equals(s)) + self.assertTrue(result2.sort_index().equals(s.sort_index())) - cat = pd.Categorical([1,2,3,10], levels=[1,2,3,4,10]) + cat = pd.Categorical([1,2,3,10], categories=[1,2,3,4,10]) df = pd.DataFrame(pd.Series(cat)) def test_describe(self): @@ -1096,7 +1259,7 @@ def test_describe(self): # In a frame, describe() for the cat should be the same as for string arrays (count, unique, # top, freq) - cat = Categorical(["a","b","b","b"], levels=['a','b','c'], ordered=True) + cat = Categorical(["a","b","b","b"], categories=['a','b','c'], ordered=True) s = Series(cat) result = s.describe() expected = Series([4,2,"b",3],index=['count','unique','top', 'freq']) @@ -1110,7 +1273,7 @@ def 
test_describe(self): def test_repr(self): a = pd.Series(pd.Categorical([1,2,3,4], name="a")) exp = u("0 1\n1 2\n2 3\n3 4\n" + - "Name: a, dtype: category\nLevels (4, int64): [1 < 2 < 3 < 4]") + "Name: a, dtype: category\nCategories (4, int64): [1 < 2 < 3 < 4]") self.assertEqual(exp, a.__unicode__()) @@ -1118,14 +1281,14 @@ def test_repr(self): exp = u("".join(["%s a\n%s b\n"%(i,i+1) for i in range(0,10,2)]) + "...\n" + "".join(["%s a\n%s b\n"%(i,i+1) for i in range(40,50,2)]) + "Name: a, Length: 50, dtype: category\n" + - "Levels (2, object): [a < b]") + "Categories (2, object): [a < b]") self.assertEqual(exp,a._tidy_repr()) levs = list("abcdefghijklmnopqrstuvwxyz") - a = pd.Series(pd.Categorical(["a","b"], name="a", levels=levs)) + a = pd.Series(pd.Categorical(["a","b"], name="a", categories=levs)) exp = u("0 a\n1 b\n" + "Name: a, dtype: category\n" - "Levels (26, object): [a < b < c < d ... w < x < y < z]") + "Categories (26, object): [a < b < c < d ... w < x < y < z]") self.assertEqual(exp,a.__unicode__()) @@ -1152,41 +1315,41 @@ def test_min_max(self): self.assertEqual(_min, "a") self.assertEqual(_max, "d") - cat = Series(Categorical(["a","b","c","d"], levels=['d','c','b','a'], ordered=True)) + cat = Series(Categorical(["a","b","c","d"], categories=['d','c','b','a'], ordered=True)) _min = cat.min() _max = cat.max() self.assertEqual(_min, "d") self.assertEqual(_max, "a") - cat = Series(Categorical([np.nan,"b","c",np.nan], levels=['d','c','b','a'], ordered=True)) + cat = Series(Categorical([np.nan,"b","c",np.nan], categories=['d','c','b','a'], ordered=True)) _min = cat.min() _max = cat.max() self.assertTrue(np.isnan(_min)) self.assertEqual(_max, "b") - cat = Series(Categorical([np.nan,1,2,np.nan], levels=[5,4,3,2,1], ordered=True)) + cat = Series(Categorical([np.nan,1,2,np.nan], categories=[5,4,3,2,1], ordered=True)) _min = cat.min() _max = cat.max() self.assertTrue(np.isnan(_min)) self.assertEqual(_max, 1) def test_mode(self): - s = Series(Categorical([1,1,2,4,5,5,5], levels=[5,4,3,2,1], ordered=True)) + s = Series(Categorical([1,1,2,4,5,5,5], categories=[5,4,3,2,1], ordered=True)) res = s.mode() - exp = Series(Categorical([5], levels=[5,4,3,2,1], ordered=True)) + exp = Series(Categorical([5], categories=[5,4,3,2,1], ordered=True)) tm.assert_series_equal(res, exp) - s = Series(Categorical([1,1,1,4,5,5,5], levels=[5,4,3,2,1], ordered=True)) + s = Series(Categorical([1,1,1,4,5,5,5], categories=[5,4,3,2,1], ordered=True)) res = s.mode() - exp = Series(Categorical([5,1], levels=[5,4,3,2,1], ordered=True)) + exp = Series(Categorical([5,1], categories=[5,4,3,2,1], ordered=True)) tm.assert_series_equal(res, exp) - s = Series(Categorical([1,2,3,4,5], levels=[5,4,3,2,1], ordered=True)) + s = Series(Categorical([1,2,3,4,5], categories=[5,4,3,2,1], ordered=True)) res = s.mode() - exp = Series(Categorical([], levels=[5,4,3,2,1], ordered=True)) + exp = Series(Categorical([], categories=[5,4,3,2,1], ordered=True)) tm.assert_series_equal(res, exp) def test_value_counts(self): - s = pd.Series(pd.Categorical(["a","b","c","c","c","b"], levels=["c","a","b","d"])) + s = pd.Series(pd.Categorical(["a","b","c","c","c","b"], categories=["c","a","b","d"])) res = s.value_counts(sort=False) exp = Series([3,1,2,0], index=["c","a","b","d"]) tm.assert_series_equal(res, exp) @@ -1196,15 +1359,15 @@ def test_value_counts(self): def test_groupby(self): - cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], levels=["a","b","c","d"]) + cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], 
categories=["a","b","c","d"]) data = DataFrame({"a":[1,1,1,2,2,2,3,4,5], "b":cats}) expected = DataFrame({ 'a' : Series([1,2,4,np.nan],index=Index(['a','b','c','d'],name='b')) }) result = data.groupby("b").mean() tm.assert_frame_equal(result, expected) - raw_cat1 = Categorical(["a","a","b","b"], levels=["a","b","z"]) - raw_cat2 = Categorical(["c","d","c","d"], levels=["c","d","y"]) + raw_cat1 = Categorical(["a","a","b","b"], categories=["a","b","z"]) + raw_cat2 = Categorical(["c","d","c","d"], categories=["c","d","y"]) df = DataFrame({"A":raw_cat1,"B":raw_cat2, "values":[1,2,3,4]}) # single grouper @@ -1236,8 +1399,8 @@ def test_groupby(self): def test_pivot_table(self): - raw_cat1 = Categorical(["a","a","b","b"], levels=["a","b","z"]) - raw_cat2 = Categorical(["c","d","c","d"], levels=["c","d","y"]) + raw_cat1 = Categorical(["a","a","b","b"], categories=["a","b","z"]) + raw_cat2 = Categorical(["c","d","c","d"], categories=["c","d","y"]) df = DataFrame({"A":raw_cat1,"B":raw_cat2, "values":[1,2,3,4]}) result = pd.pivot_table(df, values='values', index=['A', 'B']) @@ -1248,7 +1411,7 @@ def test_pivot_table(self): def test_count(self): - s = Series(Categorical([np.nan,1,2,np.nan], levels=[5,4,3,2,1], ordered=True)) + s = Series(Categorical([np.nan,1,2,np.nan], categories=[5,4,3,2,1], ordered=True)) result = s.count() self.assertEqual(result, 2) @@ -1264,7 +1427,7 @@ def test_sort(self): exp = np.array(["a","b","c","d"]) self.assert_numpy_array_equal(res.__array__(), exp) - cat = Series(Categorical(["a","c","b","d"], levels=["a","b","c","d"], ordered=True)) + cat = Series(Categorical(["a","c","b","d"], categories=["a","b","c","d"], ordered=True)) res = cat.order() exp = np.array(["a","b","c","d"]) self.assert_numpy_array_equal(res.__array__(), exp) @@ -1273,8 +1436,8 @@ def test_sort(self): exp = np.array(["d","c","b","a"]) self.assert_numpy_array_equal(res.__array__(), exp) - raw_cat1 = Categorical(["a","b","c","d"], levels=["a","b","c","d"], ordered=False) - raw_cat2 = Categorical(["a","b","c","d"], levels=["d","c","b","a"]) + raw_cat1 = Categorical(["a","b","c","d"], categories=["a","b","c","d"], ordered=False) + raw_cat2 = Categorical(["a","b","c","d"], categories=["d","c","b","a"]) s = ["a","b","c","d"] df = DataFrame({"unsort":raw_cat1,"sort":raw_cat2, "string":s, "values":[1,2,3,4]}) @@ -1298,9 +1461,9 @@ def f(): # GH 7848 df = DataFrame({"id":[6,5,4,3,2,1], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']}) df["grade"] = pd.Categorical(df["raw_grade"]) - df['grade'].cat.reorder_levels(['b', 'e', 'a']) + df['grade'] = df['grade'].cat.set_categories(['b', 'e', 'a']) - # sorts 'grade' according to the order of the levels + # sorts 'grade' according to the order of the categories result = df.sort(columns=['grade']) expected = df.iloc[[1,2,5,0,3,4]] tm.assert_frame_equal(result,expected) @@ -1314,39 +1477,39 @@ def f(): cat = Categorical(["a","c","c","b","d"], ordered=True) res = cat.order(ascending=False) exp_val = np.array(["d","c", "c", "b","a"],dtype=object) - exp_levels = np.array(["a","b","c","d"],dtype=object) + exp_categories = np.array(["a","b","c","d"],dtype=object) self.assert_numpy_array_equal(res.__array__(), exp_val) - self.assert_numpy_array_equal(res.levels, exp_levels) + self.assert_numpy_array_equal(res.categories, exp_categories) # some NaN positions cat = Categorical(["a","c","b","d", np.nan], ordered=True) res = cat.order(ascending=False, na_position='last') exp_val = np.array(["d","c","b","a", np.nan],dtype=object) - exp_levels = np.array(["a","b","c","d"],dtype=object) + 
exp_categories = np.array(["a","b","c","d"],dtype=object) self.assert_numpy_array_equal(res.__array__(), exp_val) - self.assert_numpy_array_equal(res.levels, exp_levels) + self.assert_numpy_array_equal(res.categories, exp_categories) cat = Categorical(["a","c","b","d", np.nan], ordered=True) res = cat.order(ascending=False, na_position='first') exp_val = np.array([np.nan, "d","c","b","a"],dtype=object) - exp_levels = np.array(["a","b","c","d"],dtype=object) + exp_categories = np.array(["a","b","c","d"],dtype=object) self.assert_numpy_array_equal(res.__array__(), exp_val) - self.assert_numpy_array_equal(res.levels, exp_levels) + self.assert_numpy_array_equal(res.categories, exp_categories) cat = Categorical(["a","c","b","d", np.nan], ordered=True) res = cat.order(ascending=False, na_position='first') exp_val = np.array([np.nan, "d","c","b","a"],dtype=object) - exp_levels = np.array(["a","b","c","d"],dtype=object) + exp_categories = np.array(["a","b","c","d"],dtype=object) self.assert_numpy_array_equal(res.__array__(), exp_val) - self.assert_numpy_array_equal(res.levels, exp_levels) + self.assert_numpy_array_equal(res.categories, exp_categories) cat = Categorical(["a","c","b","d", np.nan], ordered=True) res = cat.order(ascending=False, na_position='last') exp_val = np.array(["d","c","b","a",np.nan],dtype=object) - exp_levels = np.array(["a","b","c","d"],dtype=object) + exp_categories = np.array(["a","b","c","d"],dtype=object) self.assert_numpy_array_equal(res.__array__(), exp_val) - self.assert_numpy_array_equal(res.levels, exp_levels) + self.assert_numpy_array_equal(res.categories, exp_categories) def test_slicing(self): cat = Series(Categorical([1,2,3,4])) @@ -1380,13 +1543,13 @@ def test_slicing_and_getting_ops(self): # - returning a row # - returning a single value - cats = pd.Categorical(["a","c","b","c","c","c","c"], levels=["a","b","c"]) + cats = pd.Categorical(["a","c","b","c","c","c","c"], categories=["a","b","c"]) idx = pd.Index(["h","i","j","k","l","m","n"]) values= [1,2,3,4,5,6,7] df = pd.DataFrame({"cats":cats,"values":values}, index=idx) # the expected values - cats2 = pd.Categorical(["b","c"], levels=["a","b","c"]) + cats2 = pd.Categorical(["b","c"], categories=["a","b","c"]) idx2 = pd.Index(["j","k"]) values2= [3,4] @@ -1511,13 +1674,13 @@ def test_slicing_and_getting_ops(self): def test_slicing_doc_examples(self): #GH 7918 - cats = Categorical(["a","b","b","b","c","c","c"], levels=["a","b","c"]) + cats = Categorical(["a","b","b","b","c","c","c"], categories=["a","b","c"]) idx = Index(["h","i","j","k","l","m","n",]) values= [1,2,2,2,3,4,5] df = DataFrame({"cats":cats,"values":values}, index=idx) result = df.iloc[2:4,:] - expected = DataFrame({"cats":Categorical(['b','b'],levels=['a','b','c']),"values":[2,2]}, index=['j','k']) + expected = DataFrame({"cats":Categorical(['b','b'],categories=['a','b','c']),"values":[2,2]}, index=['j','k']) tm.assert_frame_equal(result, expected) result = df.iloc[2:4,:].dtypes @@ -1525,50 +1688,50 @@ def test_slicing_doc_examples(self): tm.assert_series_equal(result, expected) result = df.loc["h":"j","cats"] - expected = Series(Categorical(['a','b','b'],levels=['a','b','c']),index=['h','i','j']) + expected = Series(Categorical(['a','b','b'],categories=['a','b','c']),index=['h','i','j']) tm.assert_series_equal(result, expected) result = df.ix["h":"j",0:1] - expected = DataFrame({'cats' : Series(Categorical(['a','b','b'],levels=['a','b','c']),index=['h','i','j']) }) + expected = DataFrame({'cats' : 
Series(Categorical(['a','b','b'],categories=['a','b','c']),index=['h','i','j']) }) tm.assert_frame_equal(result, expected) def test_assigning_ops(self): # systematically test the assigning operations: # for all slicing ops: - # for value in levels and value not in levels: + # for value in categories and value not in categories: # - assign a single value -> exp_single_cats_value # - assign a complete row (mixed values) -> exp_single_row # - assign multiple rows (mixed values) (-> array) -> exp_multi_row # - assign a part of a column with dtype == categorical -> exp_parts_cats_col # - assign a part of a column with dtype != categorical -> exp_parts_cats_col - cats = pd.Categorical(["a","a","a","a","a","a","a"], levels=["a","b"]) + cats = pd.Categorical(["a","a","a","a","a","a","a"], categories=["a","b"]) idx = pd.Index(["h","i","j","k","l","m","n"]) values = [1,1,1,1,1,1,1] orig = pd.DataFrame({"cats":cats,"values":values}, index=idx) ### the expected values # changed single row - cats1 = pd.Categorical(["a","a","b","a","a","a","a"], levels=["a","b"]) + cats1 = pd.Categorical(["a","a","b","a","a","a","a"], categories=["a","b"]) idx1 = pd.Index(["h","i","j","k","l","m","n"]) values1 = [1,1,2,1,1,1,1] exp_single_row = pd.DataFrame({"cats":cats1,"values":values1}, index=idx1) #changed multiple rows - cats2 = pd.Categorical(["a","a","b","b","a","a","a"], levels=["a","b"]) + cats2 = pd.Categorical(["a","a","b","b","a","a","a"], categories=["a","b"]) idx2 = pd.Index(["h","i","j","k","l","m","n"]) values2 = [1,1,2,2,1,1,1] exp_multi_row = pd.DataFrame({"cats":cats2,"values":values2}, index=idx2) # changed part of the cats column - cats3 = pd.Categorical(["a","a","b","b","a","a","a"], levels=["a","b"]) + cats3 = pd.Categorical(["a","a","b","b","a","a","a"], categories=["a","b"]) idx3 = pd.Index(["h","i","j","k","l","m","n"]) values3 = [1,1,1,1,1,1,1] exp_parts_cats_col = pd.DataFrame({"cats":cats3,"values":values3}, index=idx3) # changed single value in cats col - cats4 = pd.Categorical(["a","a","b","a","a","a","a"], levels=["a","b"]) + cats4 = pd.Categorical(["a","a","b","a","a","a","a"], categories=["a","b"]) idx4 = pd.Index(["h","i","j","k","l","m","n"]) values4 = [1,1,1,1,1,1,1] exp_single_cats_value = pd.DataFrame({"cats":cats4,"values":values4}, index=idx4) @@ -1586,7 +1749,7 @@ def test_assigning_ops(self): tm.assert_frame_equal(df, exp_single_cats_value) - # - assign a single value not in the current level set + # - assign a single value not in the current categories set def f(): df = orig.copy() df.iloc[2,0] = "c" @@ -1597,7 +1760,7 @@ def f(): df.iloc[2,:] = ["b",2] tm.assert_frame_equal(df, exp_single_row) - # - assign a complete row (mixed values) not in level set + # - assign a complete row (mixed values) not in categories set def f(): df = orig.copy() df.iloc[2,:] = ["c",2] @@ -1615,18 +1778,18 @@ def f(): # - assign a part of a column with dtype == categorical -> exp_parts_cats_col df = orig.copy() - df.iloc[2:4,0] = pd.Categorical(["b","b"], levels=["a","b"]) + df.iloc[2:4,0] = pd.Categorical(["b","b"], categories=["a","b"]) tm.assert_frame_equal(df, exp_parts_cats_col) with tm.assertRaises(ValueError): - # different levels -> not sure if this should fail or pass + # different categories -> not sure if this should fail or pass df = orig.copy() - df.iloc[2:4,0] = pd.Categorical(["b","b"], levels=["a","b","c"]) + df.iloc[2:4,0] = pd.Categorical(["b","b"], categories=["a","b","c"]) with tm.assertRaises(ValueError): # different values df = orig.copy() - df.iloc[2:4,0] = 
pd.Categorical(["c","c"], levels=["a","b","c"]) + df.iloc[2:4,0] = pd.Categorical(["c","c"], categories=["a","b","c"]) # - assign a part of a column with dtype != categorical -> exp_parts_cats_col df = orig.copy() @@ -1647,7 +1810,7 @@ def f(): df.loc[df.index == "j","cats"] = "b" tm.assert_frame_equal(df, exp_single_cats_value) - # - assign a single value not in the current level set + # - assign a single value not in the current categories set def f(): df = orig.copy() df.loc["j","cats"] = "c" @@ -1658,7 +1821,7 @@ def f(): df.loc["j",:] = ["b",2] tm.assert_frame_equal(df, exp_single_row) - # - assign a complete row (mixed values) not in level set + # - assign a complete row (mixed values) not in categories set def f(): df = orig.copy() df.loc["j",:] = ["c",2] @@ -1676,18 +1839,18 @@ def f(): # - assign a part of a column with dtype == categorical -> exp_parts_cats_col df = orig.copy() - df.loc["j":"k","cats"] = pd.Categorical(["b","b"], levels=["a","b"]) + df.loc["j":"k","cats"] = pd.Categorical(["b","b"], categories=["a","b"]) tm.assert_frame_equal(df, exp_parts_cats_col) with tm.assertRaises(ValueError): - # different levels -> not sure if this should fail or pass + # different categories -> not sure if this should fail or pass df = orig.copy() - df.loc["j":"k","cats"] = pd.Categorical(["b","b"], levels=["a","b","c"]) + df.loc["j":"k","cats"] = pd.Categorical(["b","b"], categories=["a","b","c"]) with tm.assertRaises(ValueError): # different values df = orig.copy() - df.loc["j":"k","cats"] = pd.Categorical(["c","c"], levels=["a","b","c"]) + df.loc["j":"k","cats"] = pd.Categorical(["c","c"], categories=["a","b","c"]) # - assign a part of a column with dtype != categorical -> exp_parts_cats_col df = orig.copy() @@ -1708,7 +1871,7 @@ def f(): df.ix[df.index == "j",0] = "b" tm.assert_frame_equal(df, exp_single_cats_value) - # - assign a single value not in the current level set + # - assign a single value not in the current categories set def f(): df = orig.copy() df.ix["j",0] = "c" @@ -1719,7 +1882,7 @@ def f(): df.ix["j",:] = ["b",2] tm.assert_frame_equal(df, exp_single_row) - # - assign a complete row (mixed values) not in level set + # - assign a complete row (mixed values) not in categories set def f(): df = orig.copy() df.ix["j",:] = ["c",2] @@ -1737,18 +1900,18 @@ def f(): # - assign a part of a column with dtype == categorical -> exp_parts_cats_col df = orig.copy() - df.ix["j":"k",0] = pd.Categorical(["b","b"], levels=["a","b"]) + df.ix["j":"k",0] = pd.Categorical(["b","b"], categories=["a","b"]) tm.assert_frame_equal(df, exp_parts_cats_col) with tm.assertRaises(ValueError): - # different levels -> not sure if this should fail or pass + # different categories -> not sure if this should fail or pass df = orig.copy() - df.ix["j":"k",0] = pd.Categorical(["b","b"], levels=["a","b","c"]) + df.ix["j":"k",0] = pd.Categorical(["b","b"], categories=["a","b","c"]) with tm.assertRaises(ValueError): # different values df = orig.copy() - df.ix["j":"k",0] = pd.Categorical(["c","c"], levels=["a","b","c"]) + df.ix["j":"k",0] = pd.Categorical(["c","c"], categories=["a","b","c"]) # - assign a part of a column with dtype != categorical -> exp_parts_cats_col df = orig.copy() @@ -1763,7 +1926,7 @@ def f(): df.iat[2,0] = "b" tm.assert_frame_equal(df, exp_single_cats_value) - # - assign a single value not in the current level set + # - assign a single value not in the current categories set def f(): df = orig.copy() df.iat[2,0] = "c" @@ -1775,20 +1938,20 @@ def f(): df.at["j","cats"] = "b" 
tm.assert_frame_equal(df, exp_single_cats_value) - # - assign a single value not in the current level set + # - assign a single value not in the current categories set def f(): df = orig.copy() df.at["j","cats"] = "c" self.assertRaises(ValueError, f) # fancy indexing - catsf = pd.Categorical(["a","a","c","c","a","a","a"], levels=["a","b","c"]) + catsf = pd.Categorical(["a","a","c","c","a","a","a"], categories=["a","b","c"]) idxf = pd.Index(["h","i","j","k","l","m","n"]) valuesf = [1,1,3,3,1,1,1] df = pd.DataFrame({"cats":catsf,"values":valuesf}, index=idxf) exp_fancy = exp_multi_row.copy() - exp_fancy["cats"].cat.levels = ["a","b","c"] + exp_fancy["cats"].cat.set_categories(["a","b","c"], inplace=True) df[df["cats"] == "c"] = ["b",2] tm.assert_frame_equal(df, exp_multi_row) @@ -1806,36 +1969,36 @@ def f(): # Assigning a Category to parts of a int/... column uses the values of the Catgorical df = pd.DataFrame({"a":[1,1,1,1,1], "b":["a","a","a","a","a"]}) exp = pd.DataFrame({"a":[1,"b","b",1,1], "b":["a","a","b","b","a"]}) - df.loc[1:2,"a"] = pd.Categorical(["b","b"], levels=["a","b"]) - df.loc[2:3,"b"] = pd.Categorical(["b","b"], levels=["a","b"]) + df.loc[1:2,"a"] = pd.Categorical(["b","b"], categories=["a","b"]) + df.loc[2:3,"b"] = pd.Categorical(["b","b"], categories=["a","b"]) tm.assert_frame_equal(df, exp) ######### Series ########## - orig = Series(pd.Categorical(["b","b"], levels=["a","b"])) + orig = Series(pd.Categorical(["b","b"], categories=["a","b"])) s = orig.copy() s[:] = "a" - exp = Series(pd.Categorical(["a","a"], levels=["a","b"])) + exp = Series(pd.Categorical(["a","a"], categories=["a","b"])) tm.assert_series_equal(s, exp) s = orig.copy() s[1] = "a" - exp = Series(pd.Categorical(["b","a"], levels=["a","b"])) + exp = Series(pd.Categorical(["b","a"], categories=["a","b"])) tm.assert_series_equal(s, exp) s = orig.copy() s[s.index > 0] = "a" - exp = Series(pd.Categorical(["b","a"], levels=["a","b"])) + exp = Series(pd.Categorical(["b","a"], categories=["a","b"])) tm.assert_series_equal(s, exp) s = orig.copy() s[[False, True]] = "a" - exp = Series(pd.Categorical(["b","a"], levels=["a","b"])) + exp = Series(pd.Categorical(["b","a"], categories=["a","b"])) tm.assert_series_equal(s, exp) s = orig.copy() s.index = ["x", "y"] s["y"] = "a" - exp = Series(pd.Categorical(["b","a"], levels=["a","b"]), index=["x", "y"]) + exp = Series(pd.Categorical(["b","a"], categories=["a","b"]), index=["x", "y"]) tm.assert_series_equal(s, exp) # ensure that one can set something to np.nan @@ -1849,14 +2012,14 @@ def test_comparisons(self): tests_data = [(list("abc"), list("cba"), list("bbb")), ([1,2,3], [3,2,1], [2,2,2])] for data , reverse, base in tests_data: - cat_rev = pd.Series(pd.Categorical(data, levels=reverse)) - cat_rev_base = pd.Series(pd.Categorical(base, levels=reverse)) + cat_rev = pd.Series(pd.Categorical(data, categories=reverse)) + cat_rev_base = pd.Series(pd.Categorical(base, categories=reverse)) cat = pd.Series(pd.Categorical(data)) - cat_base = pd.Series(pd.Categorical(base, levels=cat.cat.levels)) + cat_base = pd.Series(pd.Categorical(base, categories=cat.cat.categories)) s = Series(base) a = np.array(base) - # comparisons need to take level ordering into account + # comparisons need to take categories ordering into account res_rev = cat_rev > cat_rev_base exp_rev = Series([True, False, False]) tm.assert_series_equal(res_rev, exp_rev) @@ -1869,7 +2032,7 @@ def test_comparisons(self): exp = Series([False, False, True]) tm.assert_series_equal(res, exp) - # Only categories with same 
levels can be compared + # Only categories with same categories can be compared def f(): cat > cat_rev self.assertRaises(TypeError, f) @@ -1898,73 +2061,73 @@ def f(): tm.assert_series_equal(res, exp) def test_concat(self): - cat = pd.Categorical(["a","b"], levels=["a","b"]) + cat = pd.Categorical(["a","b"], categories=["a","b"]) vals = [1,2] df = pd.DataFrame({"cats":cat, "vals":vals}) - cat2 = pd.Categorical(["a","b","a","b"], levels=["a","b"]) + cat2 = pd.Categorical(["a","b","a","b"], categories=["a","b"]) vals2 = [1,2,1,2] exp = pd.DataFrame({"cats":cat2, "vals":vals2}, index=pd.Index([0, 1, 0, 1])) res = pd.concat([df,df]) tm.assert_frame_equal(exp, res) - # Concat should raise if the two categoricals do not have the same levels - cat3 = pd.Categorical(["a","b"], levels=["a","b","c"]) + # Concat should raise if the two categoricals do not have the same categories + cat3 = pd.Categorical(["a","b"], categories=["a","b","c"]) vals3 = [1,2] - df_wrong_levels = pd.DataFrame({"cats":cat3, "vals":vals3}) + df_wrong_categories = pd.DataFrame({"cats":cat3, "vals":vals3}) def f(): - pd.concat([df,df_wrong_levels]) + pd.concat([df,df_wrong_categories]) self.assertRaises(ValueError, f) # GH 7864 # make sure ordering is preserverd df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']}) df["grade"] = pd.Categorical(df["raw_grade"]) - df['grade'].cat.reorder_levels(['e', 'a', 'b']) + df['grade'].cat.set_categories(['e', 'a', 'b']) df1 = df[0:3] df2 = df[3:] - self.assert_numpy_array_equal(df['grade'].cat.levels, df1['grade'].cat.levels) - self.assert_numpy_array_equal(df['grade'].cat.levels, df2['grade'].cat.levels) + self.assert_numpy_array_equal(df['grade'].cat.categories, df1['grade'].cat.categories) + self.assert_numpy_array_equal(df['grade'].cat.categories, df2['grade'].cat.categories) dfx = pd.concat([df1, df2]) - dfx['grade'].cat.levels - self.assert_numpy_array_equal(df['grade'].cat.levels, dfx['grade'].cat.levels) + dfx['grade'].cat.categories + self.assert_numpy_array_equal(df['grade'].cat.categories, dfx['grade'].cat.categories) def test_append(self): - cat = pd.Categorical(["a","b"], levels=["a","b"]) + cat = pd.Categorical(["a","b"], categories=["a","b"]) vals = [1,2] df = pd.DataFrame({"cats":cat, "vals":vals}) - cat2 = pd.Categorical(["a","b","a","b"], levels=["a","b"]) + cat2 = pd.Categorical(["a","b","a","b"], categories=["a","b"]) vals2 = [1,2,1,2] exp = pd.DataFrame({"cats":cat2, "vals":vals2}, index=pd.Index([0, 1, 0, 1])) res = df.append(df) tm.assert_frame_equal(exp, res) - # Concat should raise if the two categoricals do not have the same levels - cat3 = pd.Categorical(["a","b"], levels=["a","b","c"]) + # Concat should raise if the two categoricals do not have the same categories + cat3 = pd.Categorical(["a","b"], categories=["a","b","c"]) vals3 = [1,2] - df_wrong_levels = pd.DataFrame({"cats":cat3, "vals":vals3}) + df_wrong_categories = pd.DataFrame({"cats":cat3, "vals":vals3}) def f(): - df.append(df_wrong_levels) + df.append(df_wrong_categories) self.assertRaises(ValueError, f) def test_na_actions(self): - cat = pd.Categorical([1,2,3,np.nan], levels=[1,2,3]) + cat = pd.Categorical([1,2,3,np.nan], categories=[1,2,3]) vals = ["a","b",np.nan,"d"] df = pd.DataFrame({"cats":cat, "vals":vals}) - cat2 = pd.Categorical([1,2,3,3], levels=[1,2,3]) + cat2 = pd.Categorical([1,2,3,3], categories=[1,2,3]) vals2 = ["a","b","b","d"] df_exp_fill = pd.DataFrame({"cats":cat2, "vals":vals2}) - cat3 = pd.Categorical([1,2,3], levels=[1,2,3]) + cat3 = 
pd.Categorical([1,2,3], categories=[1,2,3]) vals3 = ["a","b",np.nan] df_exp_drop_cats = pd.DataFrame({"cats":cat3, "vals":vals3}) - cat4 = pd.Categorical([1,2], levels=[1,2,3]) + cat4 = pd.Categorical([1,2], categories=[1,2,3]) vals4 = ["a","b"] df_exp_drop_all = pd.DataFrame({"cats":cat4, "vals":vals4}) @@ -1985,9 +2148,9 @@ def f(): res = df.dropna() tm.assert_frame_equal(res, df_exp_drop_all) - # make sure that fillna takes both missing values and NA levels into account + # make sure that fillna takes both missing values and NA categories into account c = Categorical(["a","b",np.nan]) - c.levels = ["a","b",np.nan] + c.set_categories(["a","b",np.nan], rename=True, inplace=True) c[0] = np.nan df = pd.DataFrame({"cats":c, "vals":[1,2,3]}) df_exp = pd.DataFrame({"cats": Categorical(["a","b","a"]), "vals": [1,2,3]}) @@ -2046,7 +2209,9 @@ def test_numeric_like_ops(self): def test_cat_tab_completition(self): # test the tab completion display - ok_for_cat = ['levels','ordered','reorder_levels','remove_unused_levels'] + ok_for_cat = ['categories','codes','ordered','set_categories', + 'add_categories', 'remove_categories', 'rename_categories', + 'reorder_categories', 'remove_unused_categories'] def get_dir(s): results = [ r for r in s.cat.__dir__() if not r.startswith('_') ] return list(sorted(set(results))) @@ -2055,7 +2220,6 @@ def get_dir(s): results = get_dir(s) tm.assert_almost_equal(results,list(sorted(set(ok_for_cat)))) - if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 32a2a13849e2b..c6b5ff1769591 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -1764,11 +1764,14 @@ def test_builtins_apply(self): # GH8155 df = pd.DataFrame(np.random.randint(1, 50, (1000, 2)), columns=['jim', 'joe']) df['jolie'] = np.random.randn(1000) + print(df.head()) for keys in ['jim', ['jim', 'joe']]: # single key & multi-key + if keys == 'jim': continue for f in [max, min, sum]: fname = f.__name__ result = df.groupby(keys).apply(f) + _shape = result.shape ngroups = len(df.drop_duplicates(subset=keys)) assert result.shape == (ngroups, 3), 'invalid frame shape: '\ '{} (expected ({}, 3))'.format(result.shape, ngroups) @@ -3267,10 +3270,11 @@ def test_groupby_categorical_no_compress(self): cats = Categorical.from_codes(codes, [0, 1, 2, 3]) result = data.groupby(cats).mean() - exp = data.groupby(codes).mean().reindex(cats.levels) + exp = data.groupby(codes).mean().reindex(cats.categories) assert_series_equal(result, exp) - cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], levels=["a","b","c","d"]) + cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], + categories=["a","b","c","d"]) data = DataFrame({"a":[1,1,1,2,2,2,3,4,5], "b":cats}) result = data.groupby("b").mean() @@ -3313,7 +3317,8 @@ def test_groupby_categorical_unequal_len(self): import pandas as pd #GH3011 series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4]) - bins = pd.cut(series.dropna(), 4) + # The raises only happens with categorical, not with series of types category + bins = pd.cut(series.dropna().values, 4) # len(bins) != len(series) here self.assertRaises(ValueError,lambda : series.groupby(bins).mean()) @@ -4726,7 +4731,41 @@ def test_transform_doesnt_clobber_ints(self): expected = gb2.transform('mean') tm.assert_frame_equal(result, expected) - + def test_groupby_categorical_two_columns(self): + + # https://github.com/pydata/pandas/issues/8138 
+ d = {'cat': pd.Categorical(["a","b","a","b"], categories=["a", "b", "c"]), + 'ints': [1, 1, 2, 2],'val': [10, 20, 30, 40]} + test = pd.DataFrame(d) + + # Grouping on a single column + groups_single_key = test.groupby("cat") + res = groups_single_key.agg('mean') + exp = DataFrame({"ints":[1.5,1.5,np.nan], "val":[20,30,np.nan]}, + index=pd.Index(["a", "b", "c"], name="cat")) + tm.assert_frame_equal(res, exp) + + # Grouping on two columns + groups_double_key = test.groupby(["cat","ints"]) + res = groups_double_key.agg('mean') + exp = DataFrame({"val":[10,30,20,40,np.nan,np.nan], + "cat": ["a","a","b","b","c","c"], + "ints": [1,2,1,2,1,2]}).set_index(["cat","ints"]) + tm.assert_frame_equal(res, exp) + + d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]} + test = pd.DataFrame(d) + values = pd.cut(test['C1'], [1, 2, 3, 6]) + values.name = "cat" + groups_double_key = test.groupby([values,'C2']) + + res = groups_double_key.agg('mean') + nan = np.nan + idx = MultiIndex.from_product([["(1, 2]", "(2, 3]", "(3, 6]"],[1,2,3,4]], + names=["cat", "C2"]) + exp = DataFrame({"C1":[nan,nan,nan,nan, 3, 3,nan,nan, nan,nan, 4, 5], + "C3":[nan,nan,nan,nan, 10,100,nan,nan, nan,nan,200,34]}, index=idx) + tm.assert_frame_equal(res, exp) def assert_fp_equal(a, b): diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 7b7446a86dd0b..c9935bf398cda 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -997,7 +997,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): names = [None] * len(zipped) if levels is None: - levels = [Categorical.from_array(zp).levels for zp in zipped] + levels = [Categorical.from_array(zp).categories for zp in zipped] else: levels = [_ensure_index(x) for x in levels] else: @@ -1036,7 +1036,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): label_list.extend(concat_index.labels) else: factor = Categorical.from_array(concat_index) - levels.append(factor.levels) + levels.append(factor.categories) label_list.append(factor.codes) if len(names) == len(levels): diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py index 7390a4b11095b..3bdd49673ca71 100644 --- a/pandas/tools/tests/test_tile.py +++ b/pandas/tools/tests/test_tile.py @@ -76,12 +76,12 @@ def test_labels(self): result, bins = cut(arr, 4, retbins=True) ex_levels = ['(-0.001, 0.25]', '(0.25, 0.5]', '(0.5, 0.75]', '(0.75, 1]'] - self.assert_numpy_array_equal(result.levels, ex_levels) + self.assert_numpy_array_equal(result.categories, ex_levels) result, bins = cut(arr, 4, retbins=True, right=False) ex_levels = ['[0, 0.25)', '[0.25, 0.5)', '[0.5, 0.75)', '[0.75, 1.001)'] - self.assert_numpy_array_equal(result.levels, ex_levels) + self.assert_numpy_array_equal(result.categories, ex_levels) def test_cut_pass_series_name_to_factor(self): s = Series(np.random.randn(100), name='foo') @@ -95,7 +95,7 @@ def test_label_precision(self): result = cut(arr, 4, precision=2) ex_levels = ['(-0.00072, 0.18]', '(0.18, 0.36]', '(0.36, 0.54]', '(0.54, 0.72]'] - self.assert_numpy_array_equal(result.levels, ex_levels) + self.assert_numpy_array_equal(result.categories, ex_levels) def test_na_handling(self): arr = np.arange(0, 0.75, 0.01) @@ -120,10 +120,10 @@ def test_inf_handling(self): result = cut(data, [-np.inf, 2, 4, np.inf]) result_ser = cut(data_ser, [-np.inf, 2, 4, np.inf]) - ex_levels = ['(-inf, 2]', '(2, 4]', '(4, inf]'] + ex_categories = ['(-inf, 2]', '(2, 4]', '(4, inf]'] - np.testing.assert_array_equal(result.levels, ex_levels) - 
np.testing.assert_array_equal(result_ser.levels, ex_levels) + np.testing.assert_array_equal(result.categories, ex_categories) + np.testing.assert_array_equal(result_ser.cat.categories, ex_categories) self.assertEqual(result[5], '(4, inf]') self.assertEqual(result[0], '(-inf, 2]') self.assertEqual(result_ser[5], '(4, inf]') @@ -172,7 +172,7 @@ def test_cut_pass_labels(self): result = cut(arr, bins, labels=labels) exp = cut(arr, bins) - exp.levels = labels + exp.categories = labels self.assertTrue(result.equals(exp)) @@ -182,7 +182,7 @@ def test_qcut_include_lowest(self): cats = qcut(values, 4) ex_levels = ['[0, 2.25]', '(2.25, 4.5]', '(4.5, 6.75]', '(6.75, 9]'] - self.assertTrue((cats.levels == ex_levels).all()) + self.assertTrue((cats.categories == ex_levels).all()) def test_qcut_nas(self): arr = np.random.randn(100) @@ -216,7 +216,7 @@ def test_qcut_binning_issues(self): starts = [] ends = [] - for lev in result.levels: + for lev in result.categories: s, e = lev[1:-1].split(',') self.assertTrue(s != e) @@ -230,6 +230,25 @@ def test_qcut_binning_issues(self): self.assertTrue(ep < en) self.assertTrue(ep <= sn) + def test_cut_return_categorical(self): + from pandas import Categorical + s = Series([0,1,2,3,4,5,6,7,8]) + res = cut(s,3) + exp = Series(Categorical.from_codes([0,0,0,1,1,1,2,2,2], + ["(-0.008, 2.667]", "(2.667, 5.333]", "(5.333, 8]"], + ordered=True)) + tm.assert_series_equal(res, exp) + + def test_qcut_return_categorical(self): + from pandas import Categorical + s = Series([0,1,2,3,4,5,6,7,8]) + res = qcut(s,[0,0.333,0.666,1]) + exp = Series(Categorical.from_codes([0,0,0,1,1,1,2,2,2], + ["[0, 2.664]", "(2.664, 5.328]", "(5.328, 8]"], + ordered=True)) + tm.assert_series_equal(res, exp) + + def curpath(): pth, _ = os.path.split(os.path.abspath(__file__)) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index b28f7c89606de..06fee377be749 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -34,7 +34,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, right == True (the default), then the bins [1,2,3,4] indicate (1,2], (2,3], (3,4]. labels : array or boolean, default None - Labels to use for bin edges, or False to return integer bin labels + Labels to use for bins, or False to return integer bin labels. retbins : bool, optional Whether to return the bins or not. Can be useful if bins is given as a scalar. @@ -45,7 +45,9 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, Returns ------- - out : Categorical or array of integers if labels is False + out : Categorical or Series or array of integers if labels is False + The return type (Categorical or Series) depends on the input: a Series of type category if + input is a Series else Categorical. bins : ndarray of floats Returned only if `retbins` is True. 
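The docstring hunks above and below describe the same return-type rule for ``cut`` and ``qcut``. A minimal sketch of that rule, assuming the patched functions are reached through the usual ``pd.cut``/``pd.qcut`` entry points (variable names and bin counts below are illustrative only, and the exact interval labels are not shown):

    import numpy as np
    import pandas as pd

    values = np.array([1, 7, 5, 4, 6, 3])
    s = pd.Series(values)

    # array-like input -> a Categorical whose categories are the interval labels
    cat = pd.cut(values, 3)

    # Series input -> a Series of dtype "category", keeping the original index
    binned = pd.cut(s, 3)
    binned.dtype            # category
    binned.cat.categories   # interval labels, ordered from lowest to highest bin

    # qcut follows the same rule
    q = pd.qcut(s, [0, 0.5, 1])
    isinstance(q, pd.Series)  # True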
@@ -102,9 +104,12 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
         if (np.diff(bins) < 0).any():
             raise ValueError('bins must increase monotonically.')
 
-    return _bins_to_cuts(x, bins, right=right, labels=labels,
-                         retbins=retbins, precision=precision,
-                         include_lowest=include_lowest)
+    res = _bins_to_cuts(x, bins, right=right, labels=labels, retbins=retbins,
+                        precision=precision, include_lowest=include_lowest)
+    if isinstance(x, Series):
+        res = Series(res, index=x.index)
+    return res
+
 
 
 def qcut(x, q, labels=None, retbins=False, precision=3):
@@ -130,7 +135,8 @@ def qcut(x, q, labels=None, retbins=False, precision=3):
 
     Returns
     -------
-    cat : Categorical
+    cat : Categorical or Series
+        Returns a Series of type category if input is a Series else Categorical.
 
     Notes
     -----
@@ -144,8 +150,12 @@ def qcut(x, q, labels=None, retbins=False, precision=3):
     else:
         quantiles = q
     bins = algos.quantile(x, quantiles)
-    return _bins_to_cuts(x, bins, labels=labels, retbins=retbins,
-                         precision=precision, include_lowest=True)
+    res = _bins_to_cuts(x, bins, labels=labels, retbins=retbins,
+                        precision=precision, include_lowest=True)
+    if isinstance(x, Series):
+        res = Series(res, index=x.index)
+    return res
+
 
 
 def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
@@ -189,7 +199,7 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
         levels = np.asarray(levels, dtype=object)
         np.putmask(ids, na_mask, 0)
-        fac = Categorical(ids - 1, levels, name=name, fastpath=True)
+        fac = Categorical(ids - 1, levels, ordered=True, name=name, fastpath=True)
     else:
         fac = ids - 1
         if has_nas: