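# Download the dataset first (run this in a shell, not in Python):
#   wget https://raw.githubusercontent.com/lakshya90/DataScience101/master/titanic.csv
# On macOS, use `curl -O` with the same URL instead of `wget`.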
import pandas as pd
# Load the Titanic passenger data from the current working directory.
df = pd.read_csv('titanic.csv')

# --- First look at the frame -------------------------------------------
df.shape       # (rows, columns); the original note records 891, 12
df.head()      # first 5 rows (5 is the default)
df.tail()      # last 5 rows (5 is the default)
df.info()      # prints entry count, column names, non-null counts, dtypes
df.describe()  # summary statistics for the numeric columns

# --- Frequency of each value in a few categorical columns --------------
df['Sex'].value_counts()
df['Survived'].value_counts()
df['Pclass'].value_counts()

# --- Selecting a column and a row --------------------------------------
d = df['PassengerId']  # a single column comes back as a Series
type(d)                # <class 'pandas.core.series.Series'>
df.iloc[654]           # the row at integer position 654, as a Series

# Look at the per-column "count" row to spot features with missing values.
df.describe()
# Replace missing ages with the mean age.
# NOTE: plain assignment does not copy -- df_a and df are the same object,
# so filling df_a['Age'] also fills df['Age'].
df_a = df
df_a['Age'] = df_a['Age'].fillna(df_a['Age'].mean())  # was `fill na(` (syntax error)
df_a['Age'].count()  # original note: 891 after filling, up from 714
# Drop identifier-like columns. The result is a new frame that is not
# assigned here, so df itself keeps all of its columns.
df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

# --- Missing-value checks ----------------------------------------------
df['Cabin'].isnull()             # boolean Series: True where Cabin is missing
df[df['Embarked'].isnull()]      # rows whose Embarked value is missing

# --- Boolean masks and sorting -----------------------------------------
df['Pclass'].value_counts()
s = df['Pclass'] < 2             # True for rows with Pclass below 2
s.value_counts()                 # how many rows are True vs. False
df.sort_values(by='Age')         # rows ordered by Age (returns a new frame)

# --- Group-wise survival -----------------------------------------------
df.groupby('Sex')['Survived'].value_counts()  # Survived value counts per sex
df.groupby('Sex')['Survived'].mean()          # mean of Survived per sex

# --- Summaries of individual features ----------------------------------
df.describe(include=['O'])       # summary restricted to object-dtype columns
df['Age'].mean()                 # mean of all values of the feature 'Age'
df['Cabin'].count()              # number of non-missing 'Cabin' values
df['Fare'].max()                 # maximum fare paid for the ticket

# Write the frame to disk.
df.to_csv('output.csv')