-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtitannic knowledge
95 lines (75 loc) · 5.48 KB
/
titannic knowledge
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
1.train_data[['Sex','Survived']].groupby(['Sex']).mean().plot.bar() #绘制以Sex为x轴、Survived平均值为y轴的柱状图(因为Survived取值为
0/1,所以各组的均值即为该组的存活率)
2.fig, ax = plt.subplots(1, 2, figsize = (18, 8))
sns.violinplot("Pclass", "Age", hue="Survived", data=train_data, split=True, ax=ax[0])
ax[0].set_title('Pclass and Age vs Survived')
ax[0].set_yticks(range(0, 110, 10))
sns.violinplot("Sex", "Age", hue="Survived", data=train_data, split=True, ax=ax[1])
ax[1].set_title('Sex and Age vs Survived')
ax[1].set_yticks(range(0, 110, 10))
plt.show() #绘制多子图小提琴图,hue为分类依据
3.plt.figure(figsize=(12,5))
plt.subplot(121)
train_data['Age'].hist(bins=70)
plt.xlabel('Age')
plt.ylabel('Num')
plt.subplot(122)
train_data.boxplot(column='Age', showfliers=False)
plt.show() #绘制直方图(bins代表箱数,如x轴为[50,54)的一个柱即为一个箱)和箱图
4.bins = [0, 12, 18, 65, 100]
train_data['Age_group'] = pd.cut(train_data['Age'], bins) #用bins将数据分成多组:bins为list时,相邻两个数构成一个区间,
默认左开右闭(right=True,即包含右端点);bins为int时,在输入数据的最小值到最大值范围内
等宽划分为bins个区间,并将每个value分到相应区间。
by_age = train_data.groupby('Age_group')['Survived'].mean()
>>>by_age
Age_group
(0, 12] 0.506173
(12, 18] 0.466667
(18, 65] 0.364512
(65, 100] 0.125000
Name: Survived, dtype: float64
5.train_data['Title'] = train_data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False) #用正则表达式从Name中提取称谓:取空格之后、句点之前的连续字母(第一个匹配);expand=False时返回Series
6.plt.figure(figsize=(10,5))
plt.subplot(121)
sibsp_df['Survived'].value_counts().plot.pie(labels=['No Survived', 'Survived'], autopct = '%1.1f%%')
plt.xlabel('sibsp')
plt.subplot(122)
no_sibsp_df['Survived'].value_counts().plot.pie(labels=['No Survived', 'Survived'], autopct = '%1.1f%%')
plt.xlabel('no_sibsp')
plt.show() #绘制饼图,数据为dataframe中各value的数量,labels为图上各部分的标签
7.sns.countplot('Embarked', hue='Survived', data=train_data) #计数图,x轴为dataframe中的'Embarked'项
8.train_data['CabinLetter'] = pd.factorize(train_data['CabinLetter'])[0] # pd.factorize用整数编码代表不同的value,返回两个元素:
编码后的整数array(缺失值默认编码为-1)和包含各不同value的索引;这里用[0]只取编码后的array
9.train_data['Fare_bin'] = pd.qcut(train_data['Fare'], 5) #按分位数将数值数据分成5个bin(每个bin内样本数大致相等),train_data['Fare_bin']的value是各样本所属的区间
10.x.mode() #求出现次数最多的值(众数);返回的是Series/DataFrame而不是单个标量,若有两个并列的众数则结果中有两个值
11.combined_train_test['Title'] = combined_train_test['Name'].map(lambda x: re.compile(", (.*?)\.").findall(x)[0])
#通过re.compile().findall(x)返回x中所有匹配的捕获组内容(这里即", "之后、第一个"."之前的称谓),[0]取第一个匹配。
12.title_Dict = {}
title_Dict.update(dict.fromkeys(['Capt', 'Col', 'Major', 'Dr', 'Rev'], 'Officer')) #dict.fromkeys(keys, value)以list中每个元素为key、同一个value为值构建dict,再用update()合并到title_Dict中。
13.combined_train_test['Fare'] = combined_train_test[['Fare']].fillna(combined_train_test.groupby('Pclass').transform(np.mean))
#dataframe.groupby('').transform(np.mean):对每个group求均值,并把该均值填回组内的每一行,返回与原df行索引对齐的DataFrame;
与.mean()不同(它每个group只返回一行聚合结果)。因此可直接作为fillna()的参数按行填充。fillna()的参数可以是scalar, dict, Series, or DataFrame。
14.fare_bin_dummies_df = pd.get_dummies(combined_train_test['Fare_bin_id']).rename(columns=lambda x: 'Fare_' + str(x))
#通过rename()来重命名get_dummies后的各column
15.combined_train_test['Pclass_Fare_Category'] = combined_train_test.apply(pclass_fare_category, args=(
Pclass1_mean_fare, Pclass2_mean_fare, Pclass3_mean_fare), axis=1) #axis=1表示对combined_train_test逐行应用函数,函数的第一个参数为当前行(Series),其后依次为args中的内容
16.>>> le = preprocessing.LabelEncoder()
>>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) #拟合:收集所有不同的label并排序,得到label表classes_
LabelEncoder()
>>> list(le.classes_)
['amsterdam', 'paris', 'tokyo']
>>> le.transform(["tokyo", "tokyo", "paris"]) #将()内的每个label转换为其在label表(classes_)中的索引值
array([2, 2, 1]...)
>>> list(le.inverse_transform([2, 2, 1])) #逆变换:将索引值还原为原来的label
['tokyo', 'tokyo', 'paris']
17.Correlation = pd.DataFrame(combined_train_test[
['Embarked', 'Sex', 'Title', 'Name_length', 'Family_Size', 'Family_Size_Category','Fare', 'Fare_bin_id', 'Pclass',
'Pclass_Fare_Category', 'Age', 'Ticket_Letter', 'Cabin']]) #将想要观察特征相关性的特征列成df
colormap = plt.cm.viridis
plt.figure(figsize=(14,12))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(Correlation.astype(float).corr(),linewidths=0.1,vmax=1.0, square=True, cmap=colormap, linecolor='white', annot=True)
#绘制热力图:corr()求各特征两两之间的相关系数矩阵(默认为Pearson相关系数),annot=True表示在每个格子中标出数值。
18.scale_age_fare = preprocessing.StandardScaler().fit(combined_train_test[['Age','Fare', 'Name_length']])
combined_train_test[['Age','Fare', 'Name_length']] = scale_age_fare.transform(combined_train_test[['Age','Fare', 'Name_length']])
#标准化(standardization,不是正则化):StandardScaler对每一列减去均值再除以标准差,使各特征均值为0、方差为1。