Titanic knowledge

1.train_data[['Sex','Survived']].groupby(['Sex']).mean().plot.bar()   #绘制sex为x轴，survived平均值为y轴的柱状图（因为survived项取值为0/1，所以其均值即为生还概率）

2.fig, ax = plt.subplots(1, 2, figsize = (18, 8))
sns.violinplot("Pclass", "Age", hue="Survived", data=train_data, split=True, ax=ax[0])
ax[0].set_title('Pclass and Age vs Survived')
ax[0].set_yticks(range(0, 110, 10))

sns.violinplot("Sex", "Age", hue="Survived", data=train_data, split=True, ax=ax[1])
ax[1].set_title('Sex and Age vs Survived')
ax[1].set_yticks(range(0, 110, 10))

plt.show()   #绘制多子图小提琴图，hue为分类依据

3.plt.figure(figsize=(12,5))
plt.subplot(121)
train_data['Age'].hist(bins=70)
plt.xlabel('Age')
plt.ylabel('Num')

plt.subplot(122)
train_data.boxplot(column='Age', showfliers=False)
plt.show()             #绘制直方图（bins代表箱数，如x轴为[50,54）的一个柱即为一个箱）和箱图


4.bins = [0, 12, 18, 65, 100]
train_data['Age_group'] = pd.cut(train_data['Age'], bins)   #用bins将数据分成多组：bins为list时，每一组里的values落在bins中相邻两个数之间，
                                                             #默认为左开右闭区间（包含右侧数）；bins为int时，代表在输入数据最小值到最大值的范围内
                                                             #等宽地分为bins个组，并将values分到相应组。
by_age = train_data.groupby('Age_group')['Survived'].mean()    
>>>by_age 
Age_group
(0, 12]      0.506173
(12, 18]     0.466667
(18, 65]     0.364512
(65, 100]    0.125000
Name: Survived, dtype: float64

5.train_data['Title'] = train_data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)  #用正则表达式从dataframe的str列中提取内容，返回第一个捕获组（括号内）匹配到的部分，这里即姓名中"."之前的称谓

6.plt.figure(figsize=(10,5))
plt.subplot(121)
sibsp_df['Survived'].value_counts().plot.pie(labels=['No Survived', 'Survived'], autopct = '%1.1f%%')
plt.xlabel('sibsp')

plt.subplot(122)
no_sibsp_df['Survived'].value_counts().plot.pie(labels=['No Survived', 'Survived'], autopct = '%1.1f%%')
plt.xlabel('no_sibsp')

plt.show()     #绘制饼图，数据为dataframe中各value的数量，labels为图上各部分的标签

7.sns.countplot('Embarked', hue='Survived', data=train_data)    #计数图，x轴为dataframe中的'Embarked'项

8.train_data['CabinLetter'] = pd.factorize(train_data['CabinLetter'])[0]   # pd.factorize用数字代表dataframe中不同value，返回两个元素：
                                                                             化为数字后的array和包含不同value的array
9.train_data['Fare_bin'] = pd.qcut(train_data['Fare'], 5)     #按分位数将数值数据分成5个bin（每个bin的样本数大致相同），train_data['Fare_bin']的value是所属的区间

10.x.mode()   #求dataframe中出现最频繁的value（众数），若有多个众数则结果中全部返回

11.combined_train_test['Title'] = combined_train_test['Name'].map(lambda x: re.compile(", (.*?)\.").findall(x)[0])
                                 #通过re.compile（）.findall（x）[0]取出x字符串value中与正则表达式相匹配的内容。
12.title_Dict = {}
title_Dict.update(dict.fromkeys(['Capt', 'Col', 'Major', 'Dr', 'Rev'], 'Officer'))   #dict.fromkeys(keys, value)构建dict：第一个参数中的每个元素都作为key，第二个参数作为它们共同的value。

13.combined_train_test['Fare'] = combined_train_test[['Fare']].fillna(combined_train_test.groupby('Pclass').transform(np.mean))
                        #dataframe.groupby('').transform(np.mean)：对每个group求均值，并返回与原df行数相同、索引对齐的dataframe（每行填入其所在group的均值）；
                        #与.mean()不同（.mean()只返回每组一行的聚合结果）。因此可用它按组均值填充缺失值；fillna()的参数可以是scalar, dict, Series, or DataFrame。
                                                             
14.fare_bin_dummies_df = pd.get_dummies(combined_train_test['Fare_bin_id']).rename(columns=lambda x: 'Fare_' + str(x))
                                                             #通过rename（）来重命名get_dummies后的各column
15.combined_train_test['Pclass_Fare_Category'] = combined_train_test.apply(pclass_fare_category, args=(
    Pclass1_mean_fare, Pclass2_mean_fare, Pclass3_mean_fare), axis=1)       #对combined_train_test逐行（axis=1）应用函数，函数的第一个参数为该df的每一行（Series），其余参数为args中的内容

16.>>> le = preprocessing.LabelEncoder()
>>> le.fit(["paris", "paris", "tokyo", "amsterdam"])    #制定label表
LabelEncoder()
>>> list(le.classes_)       
['amsterdam', 'paris', 'tokyo']
>>> le.transform(["tokyo", "tokyo", "paris"])        #将（）内根据label表求出索引值表
array([2, 2, 1]...)
>>> list(le.inverse_transform([2, 2, 1]))             #逆转变
['tokyo', 'tokyo', 'paris']      

17.Correlation = pd.DataFrame(combined_train_test[
    ['Embarked', 'Sex', 'Title', 'Name_length', 'Family_Size', 'Family_Size_Category','Fare', 'Fare_bin_id', 'Pclass', 
     'Pclass_Fare_Category', 'Age', 'Ticket_Letter', 'Cabin']])    #将想要观察特征相关性的特征列成df
colormap = plt.cm.viridis
plt.figure(figsize=(14,12))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(Correlation.astype(float).corr(),linewidths=0.1,vmax=1.0, square=True, cmap=colormap, linecolor='white', annot=True)
            #绘制热度图，corr（）为求相关系数矩阵，annot=True表示在每个格子中显示相关系数的数值。
18.scale_age_fare = preprocessing.StandardScaler().fit(combined_train_test[['Age','Fare', 'Name_length']])
combined_train_test[['Age','Fare', 'Name_length']] = scale_age_fare.transform(combined_train_test[['Age','Fare', 'Name_length']])
                                                                   #标准化（standardization，不是正则化）：将各特征缩放为均值0、方差1