From 3c126f564e629bbce7cf696a1377576b6977bef9 Mon Sep 17 00:00:00 2001 From: Nathan Ford Date: Mon, 7 Aug 2017 07:56:53 -0500 Subject: [PATCH] DOC: added string processing comparison with SAS (#16497) --- doc/source/comparison_with_sas.rst | 140 +++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) diff --git a/doc/source/comparison_with_sas.rst b/doc/source/comparison_with_sas.rst index 33a347de0bf5b9..1f2424d8a22f33 100644 --- a/doc/source/comparison_with_sas.rst +++ b/doc/source/comparison_with_sas.rst @@ -357,6 +357,146 @@ takes a list of columns to sort by. tips = tips.sort_values(['sex', 'total_bill']) tips.head() + +String Processing +----------------- + +Length +~~~~~~ + +SAS determines the length of a character string with the +`LENGTHN `__ +and `LENGTHC `__ +functions. ``LENGTHN`` excludes trailing blanks and ``LENGTHC`` includes trailing blanks. + +.. code-block:: none + + data _null_; + set tips; + put(LENGTHN(time)); + put(LENGTHC(time)); + run; + +Python determines the length of a character string with the ``len`` function. +``len`` includes trailing blanks. Use ``len`` and ``rstrip`` to exclude +trailing blanks. + +.. ipython:: python + + tips['time'].str.len().head() + tips['time'].str.rstrip().str.len().head() + + +Find +~~~~ + +SAS determines the position of a substring in a string with the +`FIND `__ function. +``FIND`` takes the string defined by the first argument and searches for the first position of the substring +you supply as the second argument. + +.. code-block:: none + + data _null_; + set tips; + put(FIND(sex,'ale')); + run; + +Python determines the position of a substring in a string with the +``find`` function. ``find`` searches for the first position of the +substring. If the substring is found, the function returns its +position. Keep in mind that Python indexes are zero-based and +the function will return -1 if it fails to find the substring. + +.. 
ipython:: python + + tips['sex'].str.find("ale").head() + + +Substring +~~~~~~~~~ + +SAS extracts a substring from a string based on its position with the +`SUBSTR `__ function. + +.. code-block:: none + + data _null_; + set tips; + put(substr(sex,1,1)); + run; + +With pandas you can use ``[]`` notation to extract a substring +from a string by position. Keep in mind that Python +indexes are zero-based. + +.. ipython:: python + + tips['sex'].str[0:1].head() + + +Scan +~~~~ + +The SAS `SCAN `__ +function returns the nth word from a string. The first argument is the string you want to parse and the +second argument specifies which word you want to extract. + +.. code-block:: none + + data firstlast; + input String $60.; + First_Name = scan(string, 1); + Last_Name = scan(string, -1); + datalines2; + John Smith; + Jane Cook; + ;;; + run; + +Python extracts words from a string by splitting it on a delimiter +with the ``split`` and ``rsplit`` string methods. There are much more powerful +approaches, such as regular expressions, but this just shows a simple approach. + +.. ipython:: python + + firstlast = pd.DataFrame({'String': ['John Smith', 'Jane Cook']}) + firstlast['First_Name'] = firstlast['String'].str.split(" ", expand=True)[0] + firstlast['Last_Name'] = firstlast['String'].str.rsplit(" ", expand=True)[1] + firstlast + + +Upcase, Lowcase, and Propcase +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The SAS `UPCASE `__, +`LOWCASE `__, and +`PROPCASE `__ +functions change the case of the argument. + +.. code-block:: none + + data firstlast; + input String $60.; + string_up = UPCASE(string); + string_low = LOWCASE(string); + string_prop = PROPCASE(string); + datalines2; + John Smith; + Jane Cook; + ;;; + run; + +The equivalent Python functions are ``upper``, ``lower``, and ``title``. + +.. 
ipython:: python + + firstlast = pd.DataFrame({'String': ['John Smith', 'Jane Cook']}) + firstlast['string_up'] = firstlast['String'].str.upper() + firstlast['string_low'] = firstlast['String'].str.lower() + firstlast['string_prop'] = firstlast['String'].str.title() + firstlast + Merging -------