From 3f282da6aac82fabf0749933fe8259a8bd6f33a6 Mon Sep 17 00:00:00 2001 From: Jan Schulz Date: Mon, 22 Sep 2014 23:50:59 +0200 Subject: [PATCH] DOC: update to reflect changes in Categorical * rename of levels -> categories * don't use the `Categorical` constructor if thats not needed * introduce the new methods to manipulate categories --- doc/source/10min.rst | 8 +- doc/source/categorical.rst | 583 ++++++++++++++++++------------------- doc/source/v0.15.0.txt | 8 +- pandas/core/categorical.py | 11 +- 4 files changed, 295 insertions(+), 315 deletions(-) diff --git a/doc/source/10min.rst b/doc/source/10min.rst index 3a3b3d5e36977..6320be3920730 100644 --- a/doc/source/10min.rst +++ b/doc/source/10min.rst @@ -652,11 +652,11 @@ Since version 0.15, pandas can include categorical data in a ``DataFrame``. For # Alternative: df["grade"] = df["raw_grade"].astype("category") df["grade"] - # Rename the levels - df["grade"].cat.levels = ["very good", "good", "very bad"] + # Rename the categories inplace + df["grade"].cat.categories = ["very good", "good", "very bad"] - # Reorder the levels and simultaneously add the missing levels - df["grade"].cat.reorder_levels(["very bad", "bad", "medium", "good", "very good"]) + # Reorder the categories and simultaneously add the missing categories + df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"]) df["grade"] df.sort("grade") df.groupby("grade").size() diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index 39ec42482722b..a5b00bbc4722f 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -24,55 +24,51 @@ Categorical .. note:: While there was in `pandas.Categorical` in earlier versions, the ability to use - `Categorical` data in `Series` and `DataFrame` is new. + categorical data in `Series` and `DataFrame` is new. -This is a introduction to pandas :class:`pandas.Categorical` type, including a short comparison -with R's `factor`. 
+This is an introduction to the pandas categorical data type, including a short comparison +with R's ``factor``. `Categoricals` are a pandas data type, which correspond to categorical variables in statistics: a variable, which can take on only a limited, and usually fixed, -number of possible values (commonly called `levels`). Examples are gender, social class, +number of possible values (`categories`; `levels` in R). Examples are gender, social class, blood types, country affiliations, observation time or ratings via Likert scales. -In contrast to statistical categorical variables, a `Categorical` might have an order (e.g. +In contrast to statistical categorical variables, categorical data might have an order (e.g. 'strongly agree' vs 'agree' or 'first observation' vs. 'second observation'), but numerical operations (additions, divisions, ...) are not possible. -All values of the `Categorical` are either in `levels` or `np.nan`. Order is defined by -the order of the `levels`, not lexical order of the values. Internally, the data structure -consists of a levels array and an integer array of `codes` which point to the real value in the -levels array. +All values of categorical data are either in `categories` or `np.nan`. Order is defined by +the order of `categories`, not lexical order of the values. Internally, the data structure +consists of a `categories` array and an integer array of `codes` which point to the real value in +the `categories` array. -`Categoricals` are useful in the following cases: +The categorical data type is useful in the following cases: * A string variable consisting of only a few different values. Converting such a string variable to a categorical variable will save some memory. * The lexical order of a variable is not the same as the logical order ("one", "two", "three"). 
- By converting to a categorical and specifying an order on the levels, sorting and + By converting to a categorical and specifying an order on the categories, sorting and min/max will use the logical order instead of the lexical order. * As a signal to other python libraries that this column should be treated as a categorical - variable (e.g. to use suitable statistical methods or plot types) + variable (e.g. to use suitable statistical methods or plot types). -See also the :ref:`API docs on Categoricals`. +See also the :ref:`API docs on categoricals`. Object Creation --------------- -Categorical `Series` or columns in a `DataFrame` can be crated in several ways: +Categorical `Series` or columns in a `DataFrame` can be created in several ways: -By passing a `Categorical` object to a `Series` or assigning it to a `DataFrame`: +By specifying ``dtype="category"`` when constructing a `Series`: .. ipython:: python - raw_cat = pd.Categorical(["a","b","c","a"]) - s = pd.Series(raw_cat) + s = pd.Series(["a","b","c","a"], dtype="category") s - df = pd.DataFrame({"A":["a","b","c","a"]}) - df["B"] = raw_cat - df -By converting an existing `Series` or column to a ``category`` type: +By converting an existing `Series` or column to a ``category`` dtype: .. ipython:: python @@ -92,7 +88,21 @@ By using some special functions: See :ref:`documentation ` for :func:`~pandas.cut`. -`Categoricals` have a specific ``category`` :ref:`dtype `: +By passing a :class:`pandas.Categorical` object to a `Series` or assigning it to a `DataFrame`. +This is the only possibility to specify differently ordered categories (or no order at all) at +creation time and the only reason to use :class:`pandas.Categorical` directly: + +.. ipython:: python + + raw_cat = pd.Categorical(["a","b","c","a"], categories=["b","c","d"], + ordered=False) + s = pd.Series(raw_cat) + s + df = pd.DataFrame({"A":["a","b","c","a"]}) + df["B"] = raw_cat + df + +Categorical data has a specific ``category`` :ref:`dtype `: .. 
ipython:: python @@ -100,13 +110,13 @@ See :ref:`documentation ` for :func:`~pandas.cut`. .. note:: - In contrast to R's `factor` function, a `Categorical` is not converting input values to - string and levels will end up the same data type as the original values. + In contrast to R's `factor` function, categorical data is not converting input values to + strings and categories will end up the same data type as the original values. .. note:: In contrast to R's `factor` function, there is currently no way to assign/change labels at - creation time. Use `levels` to change the levels after creation time. + creation time. Use `categories` to change the categories after creation time. To get back to the original Series or `numpy` array, use ``Series.astype(original_dtype)`` or ``np.asarray(categorical)``: @@ -119,152 +129,145 @@ To get back to the original Series or `numpy` array, use ``Series.astype(origina s2 s3 = s2.astype('string') s3 - np.asarray(s2.cat) + np.asarray(s2) -If you have already `codes` and `levels`, you can use the :func:`~pandas.Categorical.from_codes` +If you have already `codes` and `categories`, you can use the :func:`~pandas.Categorical.from_codes` constructor to save the factorize step during normal constructor mode: .. ipython:: python splitter = np.random.choice([0,1], 5, p=[0.5,0.5]) - pd.Categorical.from_codes(splitter, levels=["train", "test"]) + s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"])) Description ----------- -Using ``.describe()`` on a ``Categorical(...)`` or a ``Series(Categorical(...))`` will show -different output. - - -As part of a `Dataframe` or as a `Series` a similar output as for a `Series` of type ``string`` is -shown. Calling ``Categorical.describe()`` will show the frequencies for each level, with NA for -unused levels. +Using ``.describe()`` on categorical data will produce similar output to a `Series` or +`DataFrame` of type ``string``. .. 
ipython:: python - cat = pd.Categorical(["a","c","c",np.nan], levels=["b","a","c",np.nan] ) + cat = pd.Categorical(["a","c","c",np.nan], categories=["b","a","c",np.nan] ) df = pd.DataFrame({"cat":cat, "s":["a","c","c",np.nan]}) df.describe() - cat.describe() + df["cat"].describe() -Working with levels -------------------- +Working with categories +----------------------- -`Categoricals` have a `levels` property, which list their possible values. If you don't -manually specify levels, they are inferred from the passed in values. `Series` of type -``category`` expose the same interface via their `cat` property. +Categorical data has a `categories` and a `ordered` property, which list their possible values and +whether the ordering matters or not. These properties are exposed as ``s.cat.categories`` and +``s.cat.ordered``. If you don't manually specify categories and ordering, they are inferred from the +passed in values. .. ipython:: python - raw_cat = pd.Categorical(["a","b","c","a"]) - raw_cat.levels - raw_cat.ordered - # Series of type "category" also expose these interface via the .cat property: - s = pd.Series(raw_cat) - s.cat.levels + s = pd.Series(["a","b","c","a"], dtype="category") + s.cat.categories s.cat.ordered -.. note:: - New `Categorical` are automatically ordered if the passed in values are sortable or a - `levels` argument is supplied. This is a difference to R's `factors`, which are unordered - unless explicitly told to be ordered (``ordered=TRUE``). - -It's also possible to pass in the levels in a specific order: +It's also possible to pass in the categories in a specific order: .. ipython:: python - raw_cat = pd.Categorical(["a","b","c","a"], levels=["c","b","a"]) - s = pd.Series(raw_cat) - s.cat.levels + s = pd.Series(pd.Categorical(["a","b","c","a"], categories=["c","b","a"])) + s.cat.categories s.cat.ordered .. note:: - - Passing in a `levels` argument implies ``ordered=True``. 
You can of course overwrite that by + New categorical data is automatically ordered if the passed in values are sortable or a + `categories` argument is supplied. This is a difference to R's `factors`, which are unordered + unless explicitly told to be ordered (``ordered=TRUE``). You can of course overwrite that by passing in an explicit ``ordered=False``. -Any value omitted in the levels argument will be replaced by `np.nan`: -.. ipython:: python - - raw_cat = pd.Categorical(["a","b","c","a"], levels=["a","b"]) - s = pd.Series(raw_cat) - s.cat.levels - s +Renaming categories +******************* -Renaming levels is done by assigning new values to the ``Category.levels`` or -``Series.cat.levels`` property: +Renaming categories is done by assigning new values to the ``Series.cat.categories`` property or +by using the :func:`Categorical.rename_categories` method: .. ipython:: python - s = pd.Series(pd.Categorical(["a","b","c","a"])) - s - s.cat.levels = ["Group %s" % g for g in s.cat.levels] + s = pd.Series(["a","b","c","a"], dtype="category") s - s.cat.levels = [1,2,3] + s.cat.categories = ["Group %s" % g for g in s.cat.categories] s + s.cat.rename_categories([1,2,3]) .. note:: - I contrast to R's `factor`, a `Categorical` can have levels of other types than string. + In contrast to R's `factor`, categorical data can have categories of other types than string. -Levels must be unique or a `ValueError` is raised: +.. note:: + + Be aware that assigning new categories is an inplace operation, while most other operations + under ``Series.cat`` per default return a new Series of dtype `category`. + +Categories must be unique or a `ValueError` is raised: .. 
ipython:: python try: - s.cat.levels = [1,1,1] + s.cat.categories = [1,1,1] except ValueError as e: print("ValueError: " + str(e)) -Appending levels can be done by assigning a levels list longer than the current levels: +Appending new categories +************************ + +Appending categories can be done by using the :func:`Categorical.add_categories` method: .. ipython:: python - s.cat.levels = [1,2,3,4] - s.cat.levels + s = s.cat.add_categories([4]) + s.cat.categories s -.. note:: - Adding levels in other positions can be done with ``.reorder_levels()``. +Removing categories +******************* -Removing a level is also possible, but only the last level(s) can be removed by assigning a -shorter list than current levels. Values which are omitted are replaced by ``np.nan``. +Removing categories can be done by using the :func:`Categorical.remove_categories` method. Values +which are removed are replaced by ``np.nan``: .. ipython:: python - s.cat.levels = [1,2] + s = s.cat.remove_categories([4]) s -.. note:: +Removing unused categories +************************** - It's only possible to remove or add a level at the last position. If that's not where you want - to remove an old or add a new level, use ``Category.reorder_levels(new_order)`` or - ``Series.cat.reorder_levels(new_order)`` methods before or after. - -Removing unused levels can also be done: +Removing unused categories can also be done: .. ipython:: python - raw = pd.Categorical(["a","b","a"], levels=["a","b","c","d"]) - c = pd.Series(raw) - raw - raw.remove_unused_levels() - raw - c.cat.remove_unused_levels() - c + s = pd.Series(pd.Categorical(["a","b","a"], categories=["a","b","c","d"])) + s + s.cat.remove_unused_categories() -.. note:: +Setting categories +****************** + +If you want to remove and add new categories in one step (which has some speed advantage), +or simply set the categories to a predefined scale, use :func:`Categorical.set_categories`. + +.. 
ipython:: python - In contrast to R's `factor` function, passing a `Categorical` as the sole input to the - `Categorical` constructor will *not* remove unused levels but create a new `Categorical` - which is equal to the passed in one! + s = pd.Series(["one","two","four", "-"], dtype="category") + s + s = s.cat.set_categories(["one","two","three","four"]) + s +.. note:: + Be aware that :func:`Categorical.set_categories` cannot know whether some category is omitted + intentionally or because it is misspelled or (under Python3) due to a type difference (e.g., + numpys S1 dtype and python strings). This can result in surprising behaviour! Ordered or not... ----------------- -If a `Categoricals` is ordered (``cat.ordered == True``), then the order of the levels has a +If categorical data is ordered (``s.cat.ordered == True``), then the order of the categories has a meaning and certain operations are possible. If the categorical is unordered, a `TypeError` is raised. @@ -275,89 +278,81 @@ raised. s.sort() except TypeError as e: print("TypeError: " + str(e)) - s = pd.Series(pd.Categorical(["a","b","c","a"], ordered=True)) + s = pd.Series(["a","b","c","a"], dtype="category") # ordered per default! s.sort() s - print(s.min(), s.max()) + s.min(), s.max() -.. note:: - ``ordered=True`` is not necessary needed in the second case, as lists of strings are sortable - and so the resulting `Categorical` is ordered. - -Sorting will use the order defined by levels, not any lexical order present on the data type. +Sorting will use the order defined by categories, not any lexical order present on the data type. This is even true for strings and numeric data: .. 
ipython:: python - s = pd.Series(pd.Categorical([1,2,3,1])) - s.cat.levels = [2,3,1] + s = pd.Series([1,2,3,1], dtype="category") + s.cat.categories = [2,3,1] s s.sort() s print(s.min(), s.max()) -Reordering the levels is possible via the ``Categorical.reorder_levels(new_levels)`` or -``Series.cat.reorder_levels(new_levels)`` methods. All old levels must be included in the new -levels. Note that per default, this operation returns a new Series and you need to specify -``inplace=True`` to do the change inplace! +Reordering the categories is possible via the :func:`Categorical.reorder_categories` and +the :func:`Categorical.set_categories` methods. For :func:`Categorical.reorder_categories`, all +old categories must be included in the new categories and no new categories are allowed. .. ipython:: python - s = pd.Series(pd.Categorical([1,2,3,1])) - s2 = s.cat.reorder_levels([2,3,1]) - s2 - s2.sort() - s2 - print(s2.min(), s2.max()) - + s = pd.Series([1,2,3,1], dtype="category") + s = s.cat.reorder_categories([2,3,1]) + s + s.sort() + s + print(s.min(), s.max()) .. note:: - Note the difference between assigning new level names and reordering the levels: the first - renames levels and therefore the individual values in the `Series`, but if the first + + Note the difference between assigning new categories and reordering the categories: the first + renames categories and therefore the individual values in the `Series`, but if the first position was sorted last, the renamed value will still be sorted last. Reordering means that the way values are sorted is different afterwards, but not that individual values in the `Series` are changed. -You can also add new levels with :func:`Categorical.reorder_levels`, as long as you include all -old levels: - -.. ipython:: python - - s = pd.Series(pd.Categorical(["a","b","d"])) - s3 = s.cat.reorder_levels(["a","b","c","d"]) - s3 +.. 
note:: + If the `Categorical` is not ordered, ``Series.min()`` and ``Series.max()`` will raise + `TypeError`. Numeric operations like ``+``, ``-``, ``*``, ``/`` and operations based on them + (e.g. ``Series.median()``, which would need to compute the mean between two values if the length + of an array is even) do not work and raise a `TypeError`. -Operations ----------- -The following operations are possible with categorical data: +Comparisons +----------- Comparing `Categoricals` with other objects is possible in two cases: - * comparing a `Categorical` to another `Categorical`, when `level` and `ordered` is the same or - * comparing a `Categorical` to a scalar. + * comparing a categorical Series to another categorical Series, when `categories` and `ordered` are + the same or + * comparing a categorical Series to a scalar. All other comparisons will raise a TypeError. .. ipython:: python - cat = pd.Series(pd.Categorical([1,2,3], levels=[3,2,1])) - cat_base = pd.Series(pd.Categorical([2,2,2], levels=[3,2,1])) + cat = pd.Series(pd.Categorical([1,2,3], categories=[3,2,1])) + cat_base = pd.Series(pd.Categorical([2,2,2], categories=[3,2,1])) cat_base2 = pd.Series(pd.Categorical([2,2,2])) cat cat_base cat_base2 -Comparing to a categorical with the same levels and ordering or to a scalar works: +Comparing to a categorical with the same categories and ordering or to a scalar works: .. ipython:: python cat > cat_base cat > 2 -This doesn't work because the levels are not the same: +This doesn't work because the categories are not the same: .. ipython:: python @@ -368,10 +363,11 @@ This doesn't work because the levels are not the same: .. note:: - Comparisons with `Series`, `np.array` or a `Categorical` with different levels or ordering - will raise an `TypeError` because custom level ordering would result in two valid results: - one with taking in account the ordering and one without. 
If you want to compare a `Categorical` - with such a type, you need to be explicit and convert the `Categorical` to values: + Comparisons with `Series`, `np.array` or a `Categorical` with different categories or ordering + will raise an `TypeError` because custom categories ordering could be interpreted in two ways: + one with taking in account the ordering and one without. If you want to compare a categorical + series with such a type, you need to be explicit and convert the categorical data back to the + original values: .. ipython:: python @@ -384,50 +380,29 @@ This doesn't work because the levels are not the same: np.asarray(cat) > base -Getting the minimum and maximum, if the categorical is ordered: - -.. ipython:: python - - s = pd.Series(pd.Categorical(["a","b","c","a"], levels=["c","a","b","d"])) - print(s.min(), s.max()) - -.. note:: - - If the `Categorical` is not ordered, ``Categorical.min()`` and ``Categorical.max()`` and the - corresponding operations on `Series` will raise `TypeError`. - -The mode: - -.. ipython:: python - - raw_cat = pd.Categorical(["a","b","c","c"], levels=["c","a","b","d"]) - s = pd.Series(raw_cat) - raw_cat.mode() - s.mode() - -.. note:: +Operations +---------- - Numeric operations like ``+``, ``-``, ``*``, ``/`` and operations based on them (e.g. - ``.median()``, which would need to compute the mean between two values if the length of an - array is even) do not work and raise a `TypeError`. +Apart from ``Series.min()``, ``Series.max()`` and ``Series.mode()``, the following operations are +possible with categorical data: -`Series` methods like `Series.value_counts()` will use all levels, even if some levels are not +`Series` methods like `Series.value_counts()` will use all categories, even if some categories are not present in the data: .. 
ipython:: python - s = pd.Series(pd.Categorical(["a","b","c","c"], levels=["c","a","b","d"])) + s = pd.Series(pd.Categorical(["a","b","c","c"], categories=["c","a","b","d"])) s.value_counts() -Groupby will also show "unused" levels: +Groupby will also show "unused" categories: .. ipython:: python - cats = pd.Categorical(["a","b","b","b","c","c","c"], levels=["a","b","c","d"]) + cats = pd.Categorical(["a","b","b","b","c","c","c"], categories=["a","b","c","d"]) df = pd.DataFrame({"cats":cats,"values":[1,2,2,2,3,4,5]}) df.groupby("cats").mean() - cats2 = pd.Categorical(["a","a","b","b"], levels=["a","b","c"]) + cats2 = pd.Categorical(["a","a","b","b"], categories=["a","b","c"]) df2 = pd.DataFrame({"cats":cats2,"B":["c","d","c","d"], "values":[1,2,3,4]}) df2.groupby(["cats","B"]).mean() @@ -436,7 +411,7 @@ Pivot tables: .. ipython:: python - raw_cat = pd.Categorical(["a","a","b","b"], levels=["a","b","c"]) + raw_cat = pd.Categorical(["a","a","b","b"], categories=["a","b","c"]) df = pd.DataFrame({"A":raw_cat,"B":["c","d","c","d"], "values":[1,2,3,4]}) pd.pivot_table(df, values='values', index=['A', 'B']) @@ -445,7 +420,7 @@ Data munging The optimized pandas data access methods ``.loc``, ``.iloc``, ``.ix`` ``.at``, and ``.iat``, work as normal, the only difference is the return type (for getting) and -that only values already in the levels can be assigned. +that only values already in `categories` can be assigned. Getting ~~~~~~~ @@ -455,8 +430,8 @@ the ``category`` dtype is preserved. .. ipython:: python - cats = pd.Categorical(["a","b","b","b","c","c","c"], levels=["a","b","c"]) idx = pd.Index(["h","i","j","k","l","m","n",]) + cats = pd.Series(["a","b","b","b","c","c","c"], dtype="category", index=idx) values= [1,2,2,2,3,4,5] df = pd.DataFrame({"cats":cats,"values":values}, index=idx) df.iloc[2:4,:] @@ -465,7 +440,7 @@ the ``category`` dtype is preserved. 
df.ix["h":"j",0:1] df[df["cats"] == "b"] -An example where the `Categorical` is not preserved is if you take one single row: the +An example where the category type is not preserved is if you take one single row: the resulting `Series` is of dtype ``object``: .. ipython:: python @@ -473,20 +448,20 @@ resulting `Series` is of dtype ``object``: # get the complete "h" row as a Series df.loc["h", :] -Returning a single item from a `Categorical` will also return the value, not a `Categorical` +Returning a single item from categorical data will also return the value, not a categorical of length "1". .. ipython:: python df.iat[0,0] - df["cats"].cat.levels = ["x","y","z"] + df["cats"].cat.categories = ["x","y","z"] df.at["h","cats"] # returns a string .. note:: This is a difference to R's `factor` function, where ``factor(c(1,2,3))[1]`` returns a single value `factor`. -To get a single value `Series` of type ``category`` pass in a single value list: +To get a single value `Series` of type ``category`` pass in a list with a single value: .. ipython:: python @@ -496,12 +471,12 @@ Setting ~~~~~~~ Setting values in a categorical column (or `Series`) works as long as the value is included in the -`levels`: +`categories`: .. ipython:: python - cats = pd.Categorical(["a","a","a","a","a","a","a"], levels=["a","b"]) idx = pd.Index(["h","i","j","k","l","m","n"]) + cats = pd.Categorical(["a","a","a","a","a","a","a"], categories=["a","b"]) values = [1,1,1,1,1,1,1] df = pd.DataFrame({"cats":cats,"values":values}, index=idx) @@ -512,14 +487,14 @@ Setting values in a categorical column (or `Series`) works as long as the value except ValueError as e: print("ValueError: " + str(e)) -Setting values by assigning a `Categorical` will also check that the `levels` match: +Setting values by assigning categorical data will also check that the `categories` match: .. 
ipython:: python - df.loc["j":"k","cats"] = pd.Categorical(["a","a"], levels=["a","b"]) + df.loc["j":"k","cats"] = pd.Categorical(["a","a"], categories=["a","b"]) df try: - df.loc["j":"k","cats"] = pd.Categorical(["b","b"], levels=["a","b","c"]) + df.loc["j":"k","cats"] = pd.Categorical(["b","b"], categories=["a","b","c"]) except ValueError as e: print("ValueError: " + str(e)) @@ -528,8 +503,8 @@ Assigning a `Categorical` to parts of a column of other types will use the value .. ipython:: python df = pd.DataFrame({"a":[1,1,1,1,1], "b":["a","a","a","a","a"]}) - df.loc[1:2,"a"] = pd.Categorical(["b","b"], levels=["a","b"]) - df.loc[2:3,"b"] = pd.Categorical(["b","b"], levels=["a","b"]) + df.loc[1:2,"a"] = pd.Categorical(["b","b"], categories=["a","b"]) + df.loc[2:3,"b"] = pd.Categorical(["b","b"], categories=["a","b"]) df df.dtypes @@ -538,29 +513,29 @@ Merging ~~~~~~~ You can concat two `DataFrames` containing categorical data together, -but the levels of these `Categoricals` need to be the same: +but the categories of these categoricals need to be the same: .. ipython:: python - cat = pd.Categorical(["a","b"], levels=["a","b"]) + cat = pd.Series(["a","b"], dtype="category") vals = [1,2] df = pd.DataFrame({"cats":cat, "vals":vals}) res = pd.concat([df,df]) res res.dtypes -In this case the levels are not the same and so an error is raised: +In this case the categories are not the same and so an error is raised: .. ipython:: python df_different = df.copy() - df_different["cats"].cat.levels = ["a","b","c"] + df_different["cats"].cat.categories = ["c","d"] try: pd.concat([df,df_different]) except ValueError as e: print("ValueError: " + str(e)) -The same applies to ``df.append(df)``. +The same applies to ``df.append(df_different)``. Getting Data In/Out ------------------- @@ -569,8 +544,8 @@ Writing data (`Series`, `Frames`) to a HDF store that contains a ``category`` dt raise ``NotImplementedError``. 
Writing to a CSV file will convert the data, effectively removing any information about the -`Categorical` (levels and ordering). So if you read back the CSV file you have to convert the -relevant columns back to `category` and assign the right levels and level ordering. +categorical (categories and ordering). So if you read back the CSV file you have to convert the +relevant columns back to `category` and assign the right categories and categories ordering. .. ipython:: python :suppress: @@ -580,10 +555,10 @@ relevant columns back to `category` and assign the right levels and level orderi .. ipython:: python s = pd.Series(pd.Categorical(['a', 'b', 'b', 'a', 'a', 'd'])) - # rename the levels - s.cat.levels = ["very good", "good", "bad"] - # reorder the levels and add missing levels - s = s.cat.reorder_levels(["very bad", "bad", "medium", "good", "very good"]) + # rename the categories + s.cat.categories = ["very good", "good", "bad"] + # reorder the categories and add missing categories + s = s.cat.set_categories(["very bad", "bad", "medium", "good", "very good"]) df = pd.DataFrame({"cats":s, "vals":[1,2,3,4,5,6]}) csv = StringIO() df.to_csv(csv) @@ -592,7 +567,8 @@ relevant columns back to `category` and assign the right levels and level orderi df2["cats"] # Redo the category df2["cats"] = df2["cats"].astype("category") - df2["cats"].cat.reorder_levels(["very bad", "bad", "medium", "good", "very good"], inplace=True) + df2["cats"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"], + inplace=True) df2.dtypes df2["cats"] @@ -604,46 +580,88 @@ pandas primarily uses the value `np.nan` to represent missing data. It is by default not included in computations. See the :ref:`Missing Data section ` -There are two ways a `np.nan` can be represented in `Categorical`: either the value is not -available ("missing value") or `np.nan` is a valid level. 
+There are two ways a `np.nan` can be represented in categorical data: either the value is not +available ("missing value") or `np.nan` is a valid category. .. ipython:: python - s = pd.Series(pd.Categorical(["a","b",np.nan,"a"])) + s = pd.Series(["a","b",np.nan,"a"], dtype="category") + # only two categories s - # only two levels - s.cat.levels - s2 = pd.Series(pd.Categorical(["a","b","c","a"])) - s2.cat.levels = [1,2,np.nan] + s2 = pd.Series(["a","b","c","a"], dtype="category") + s2.cat.categories = [1,2,np.nan] + # three categories, np.nan included s2 - # three levels, np.nan included - # Note: as int arrays can't hold NaN the levels were converted to object - s2.cat.levels + +.. note:: + As integer `Series` can't include NaN, the categories were converted to `object`. .. note:: Missing value methods like ``isnull`` and ``fillna`` will take both missing values as well as - `np.nan` levels into account: + `np.nan` categories into account: .. ipython:: python - c = pd.Categorical(["a","b",np.nan]) - c.levels = ["a","b",np.nan] - # will be inserted as a NA level: + c = pd.Series(["a","b",np.nan], dtype="category") + c.cat.set_categories(["a","b",np.nan], inplace=True) + # will be inserted as a NA category: c[0] = np.nan s = pd.Series(c) s pd.isnull(s) s.fillna("a") +Differences to R's `factor` +--------------------------- + +The following differences to R's factor functions can be observed: + +* R's `levels` are named `categories` +* R's `levels` are always of type string, while `categories` in pandas can be of any dtype. +* New categorical data is automatically ordered if the passed in values are sortable or a + `categories` argument is supplied. This is a difference to R's `factors`, which are unordered + unless explicitly told to be ordered (``ordered=TRUE``). +* It's not possible to specify labels at creation time. Use ``s.cat.rename_categories(new_labels)`` + afterwards. 
+* In contrast to R's `factor` function, using categorical data as the sole input to create a + new categorical series will *not* remove unused categories but create a new categorical series + which is equal to the passed in one! + Gotchas ------- +Old style constructor usage +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In versions earlier than pandas 0.15, a `Categorical` could be constructed by passing in precomputed +`codes` (then called `labels`) instead of values with categories. The `codes` were interpreted as +pointers to the categories with `-1` as `NaN`. This type of constructor usage is replaced by +the special constructor :func:`Categorical.from_codes`. + +Unfortunately, in some special cases, using code which assumes the old style constructor usage +will work with the current pandas version, resulting in subtle bugs: + +.. code-block:: python + + >>> cat = pd.Categorical([1,2], [1,2,3]) + >>> # old version + >>> cat.get_values() + array([2, 3], dtype=int64) + >>> # new version + >>> cat.get_values() + array([1, 2], dtype=int64) + +.. warning:: + If you used `Categoricals` with older versions of pandas, please audit your code before + upgrading and change your code to use the :func:`~pandas.Categorical.from_codes` + constructor. + `Categorical` is not a `numpy` array ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Currently, `Categorical` and the corresponding ``category`` `Series` is implemented as a python -object and not as a low level `numpy` array dtype. This leads to some problems. +Currently, categorical data and the underlying `Categorical` is implemented as a python +object and not as a low-level `numpy` array dtype. This leads to some problems. `numpy` itself doesn't know about the new `dtype`: @@ -668,7 +686,7 @@ Dtype comparisons work: np.str_ == dtype Using `numpy` functions on a `Series` of type ``category`` should not work as `Categoricals` -are not numeric data (even in the case that ``.levels`` is numeric). 
+are not numeric data (even in the case that ``.categories`` is numeric). .. ipython:: python @@ -682,6 +700,42 @@ are not numeric data (even in the case that ``.levels`` is numeric). .. note:: If such a function works, please file a bug at https://github.com/pydata/pandas! +dtype in apply +~~~~~~~~~~~~~~ + +Pandas currently does not preserve the dtype in apply functions: If you apply along rows you get +a `Series` of ``object`` `dtype` (same as getting a row -> getting one element will return a +basic type) and applying along columns will also convert to object. + +.. ipython:: python + + df = pd.DataFrame({"a":[1,2,3,4], + "b":["a","b","c","d"], + "cats":pd.Categorical([1,2,3,2])}) + df.apply(lambda row: type(row["cats"]), axis=1) + df.apply(lambda col: col.dtype, axis=0) + +No categorical index +~~~~~~~~~~~~~~~~~~~~ + +There is currently no index of type ``category``, so setting the index to a categorical column will +convert the categorical data to a "normal" dtype first and therefore remove any custom +ordering of the categories: + +.. ipython:: python + + cats = pd.Categorical([1,2,3,4], categories=[4,2,3,1]) + strings = ["a","b","c","d"] + values = [4,2,3,1] + df = pd.DataFrame({"strings":strings, "values":values}, index=cats) + df.index + # This should sort by categories but does not as there is no CategoricalIndex! + df.sort_index() + +.. note:: + This could change if a `CategoricalIndex` is implemented (see + https://github.com/pydata/pandas/issues/7629) + Side effects ~~~~~~~~~~~~ @@ -691,114 +745,31 @@ means that changes to the `Series` will in most cases change the original `Categ .. 
ipython:: python - cat = pd.Categorical([1,2,3,10], levels=[1,2,3,4,10]) + cat = pd.Categorical([1,2,3,10], categories=[1,2,3,4,10]) s = pd.Series(cat, name="cat") cat s.iloc[0:2] = 10 cat df = pd.DataFrame(s) - df["cat"].cat.levels = [1,2,3,4,5] + df["cat"].cat.categories = [1,2,3,4,5] cat -Use ``copy=True`` to prevent such a behaviour: +Use ``copy=True`` to prevent such a behaviour or simply don't reuse `Categoricals`: .. ipython:: python - cat = pd.Categorical([1,2,3,10], levels=[1,2,3,4,10]) + cat = pd.Categorical([1,2,3,10], categories=[1,2,3,4,10]) s = pd.Series(cat, name="cat", copy=True) cat s.iloc[0:2] = 10 cat .. note:: - This also happens in some cases when you supply a `numpy` array instea dof a `Categorical`: - using an int array (e.g. ``np.array([1,2,3,4])``) will exhibit the same behaviour, but using + This also happens in some cases when you supply a `numpy` array instead of a `Categorical`: + using an int array (e.g. ``np.array([1,2,3,4])``) will exhibit the same behaviour, while using a string array (e.g. ``np.array(["a","b","c","a"])``) will not. -Danger of confusion -~~~~~~~~~~~~~~~~~~~ - -Both `Series` and `Categorical` have a method ``.reorder_levels()`` but for different things. For -Series of type ``category`` this means that there is some danger to confuse both methods. - -.. ipython:: python - - s = pd.Series(pd.Categorical([1,2,3,4])) - print(s.cat.levels) - # wrong and raises an error: - try: - s.reorder_levels([4,3,2,1]) - except Exception as e: - print("Exception: " + str(e)) - # right - s = s.cat.reorder_levels([4,3,2,1]) - print(s.cat.levels) - -See also the API documentation for :func:`pandas.Series.reorder_levels` and -:func:`pandas.Categorical.reorder_levels` - -Old style constructor usage -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -I earlier versions, a `Categorical` could be constructed by passing in precomputed `codes` -(called then `labels`) instead of values with levels. 
The `codes` are interpreted as pointers -to the levels with `-1` as `NaN`. - -.. ipython:: python - :okwarning: - - # This raises a FutureWarning: - cat = pd.Categorical([1,2], levels=[1,2,3], compat=True) - cat.get_values() - -In the default case (``compat=False``) the first argument is interpreted as values. - -.. ipython:: python - - cat = pd.Categorical([1,2], levels=[1,2,3], compat=False) - cat.get_values() - -.. warning:: - Using Categorical with precomputed codes and levels is deprecated and a `FutureWarning` - is raised. Please change your code to use the :func:`~pandas.Categorical.from_codes` - constructor instead of adding ``compat=False``. - -No categorical index -~~~~~~~~~~~~~~~~~~~~ - -There is currently no index of type ``category``, so setting the index to a `Categorical` will -convert the `Categorical` to a normal `numpy` array first and therefore remove any custom -ordering of the levels: - -.. ipython:: python - - cats = pd.Categorical([1,2,3,4], levels=[4,2,3,1]) - strings = ["a","b","c","d"] - values = [4,2,3,1] - df = pd.DataFrame({"strings":strings, "values":values}, index=cats) - df.index - # This should sort by levels but does not as there is no CategoricalIndex! - df.sort_index() - -.. note:: - This could change if a `CategoricalIndex` is implemented (see - https://github.com/pydata/pandas/issues/7629) - -dtype in apply -~~~~~~~~~~~~~~ - -Pandas currently does not preserve the dtype in apply functions: If you apply along rows you get -a `Series` of ``object`` `dtype` (same as getting a row -> getting one element will return a -basic type) and applying along columns will also convert to object. - -.. 
ipython:: python - - df = pd.DataFrame({"a":[1,2,3,4], "b":["a","b","c","d"], "cats":pd.Categorical([1,2,3,2])}) - df.apply(lambda row: type(row["cats"]), axis=1) - df.apply(lambda col: col.dtype, axis=0) - - Future compatibility ~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index 415208b9db2a7..d71775772eba4 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -570,11 +570,11 @@ For full docs, see the :ref:`Categorical introduction ` and the # Alternative: df["grade"] = df["raw_grade"].astype("category") df["grade"] - # Rename the levels - df["grade"].cat.levels = ["very good", "good", "very bad"] + # Rename the categories + df["grade"].cat.categories = ["very good", "good", "very bad"] - # Reorder the levels and simultaneously add the missing levels - df["grade"].cat.reorder_levels(["very bad", "bad", "medium", "good", "very good"]) + # Reorder the categories and simultaneously add the missing categories + df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"]) df["grade"] df.sort("grade") df.groupby("grade").size() diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 6c6ebbc786f26..9ee0018500b00 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -85,6 +85,8 @@ def _maybe_to_categorical(array): The assigned value has to be a list-like object. All items must be unique and the number of items in the new categories must be the same as the number of items in the old categories. +Assigning to `categories` is an inplace operation! + Raises ------ ValueError @@ -1332,13 +1334,20 @@ class CategoricalAccessor(PandasDelegate): """ Accessor object for categorical properties of the Series values. + Be aware that assigning to `categories` is an inplace operation, while all methods return + new categorical data by default (but can be called with `inplace=True`). 
+ Examples -------- >>> s.cat.categories >>> s.cat.categories = list('abc') + >>> s.cat.rename_categories(list('cab')) >>> s.cat.reorder_categories(list('cab')) + >>> s.cat.add_categories(['d','e']) + >>> s.cat.remove_categories(['d']) + >>> s.cat.remove_unused_categories() + >>> s.cat.set_categories(list('abcde')) - Allows accessing to specific getter and access methods """ def __init__(self, values, index):