In [1]:
import pandas as pd
In [2]:
# Read the training and test splits from the data/ directory (train first, then test).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv')
)
In [3]:
# Report the (rows, columns) shape of each split, one line per split.
for label, frame in (('train shape:', train), ('test shape:', test)):
    print(label, frame.shape)
train shape: (88181, 35) test shape: (37792, 34)
In [4]:
# Display the training frame; the notebook repr shows only the first and last rows.
train
Out[4]:
id | duration | protocol_type | service | flag | src_bytes | dst_bytes | land | wrong_fragment | urgent | ... | serror_rate | srv_serror_rate | rerror_rate | srv_rerror_rate | same_srv_rate | diff_srv_rate | srv_diff_host_rate | dst_host_count | dst_host_srv_count | class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 18965 | 0.0 | tcp | http | SF | 319.0 | 334.0 | 0 | 0.0 | 0.0 | ... | NaN | 0.000000 | 0.0 | 0.000000 | 0.932313 | 0.000000 | 0.085849 | 150.0 | 248.0 | normal |
1 | 75483 | 0.0 | udp | private | SF | NaN | 19.0 | 0 | 0.0 | 0.0 | ... | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.140020 | 0.186396 | 0.000000 | 287.0 | NaN | attack |
2 | 112534 | 0.0 | udp | domain_u | SF | 31.0 | 87.0 | 0 | 0.0 | 0.0 | ... | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.947003 | 0.000000 | 0.000000 | 244.0 | 250.0 | normal |
3 | 5811 | 0.0 | tcp | Z39_50 | S0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | ... | 0.904851 | 1.036665 | 0.0 | 0.000000 | 0.074540 | 0.050226 | 0.000000 | 282.0 | 18.0 | attack |
4 | 33469 | NaN | tcp | private | S0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | ... | 0.955840 | 0.931652 | 0.0 | 0.000000 | 0.028668 | 0.059607 | 0.000000 | 264.0 | 6.0 | attack |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
88176 | 93942 | 0.0 | tcp | private | S0 | NaN | 0.0 | 0 | 0.0 | 0.0 | ... | 0.938058 | 0.957549 | 0.0 | 0.000000 | 0.059318 | 0.054229 | 0.000000 | 233.0 | 13.0 | attack |
88177 | 22377 | 0.0 | tcp | http | SF | 281.0 | NaN | 0 | 0.0 | 0.0 | ... | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.905550 | 0.000000 | 0.000000 | 24.0 | 253.0 | normal |
88178 | 43498 | 0.0 | tcp | ftp_data | SF | 3345.0 | 0.0 | 0 | 0.0 | 0.0 | ... | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.947099 | 0.000000 | 0.072131 | 250.0 | NaN | normal |
88179 | 103546 | 0.0 | tcp | http | SF | 217.0 | 295.0 | 0 | 0.0 | 0.0 | ... | 0.000000 | 0.000000 | 0.0 | 0.107418 | 0.944223 | 0.000000 | 0.420514 | 227.0 | 240.0 | normal |
88180 | 77217 | 0.0 | udp | domain_u | SF | 48.0 | NaN | 0 | 0.0 | 0.0 | ... | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.895179 | 0.000000 | 0.242852 | 227.0 | 260.0 | normal |
88181 rows × 35 columns
In [5]:
# Display the test frame; it has one column fewer than train (no 'class' label).
test
Out[5]:
id | duration | protocol_type | service | flag | src_bytes | dst_bytes | land | wrong_fragment | urgent | ... | srv_count | serror_rate | srv_serror_rate | rerror_rate | srv_rerror_rate | same_srv_rate | diff_srv_rate | srv_diff_host_rate | dst_host_count | dst_host_srv_count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 8217 | 0.00000 | udp | domain_u | SF | 30.0 | NaN | 0 | 0.0 | 0.0 | ... | 183.0 | 0.000000 | 0.0 | NaN | 0.000000 | 1.040264 | 0.000000 | 0.009590 | 254.0 | 230.0 |
1 | 71759 | 0.00000 | tcp | http | SF | 286.0 | 19905.0 | 0 | 0.0 | 0.0 | ... | 11.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.898254 | 0.000000 | 0.239601 | 283.0 | 266.0 |
2 | 41181 | 0.00000 | tcp | http | SF | 379.0 | 1984.0 | 0 | 0.0 | 0.0 | ... | 24.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.956086 | 0.000000 | 0.067805 | 254.0 | NaN |
3 | 34327 | 0.00000 | tcp | http | SF | 309.0 | NaN | 0 | 0.0 | 0.0 | ... | 2.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.983514 | 0.000000 | 0.000000 | 32.0 | 253.0 |
4 | 110640 | 0.00000 | tcp | http | SF | 181.0 | 1507.0 | 0 | 0.0 | 0.0 | ... | 5.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.942385 | 0.000000 | 0.000000 | 255.0 | 244.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
37787 | 121411 | 0.00000 | tcp | private | REJ | 0.0 | 0.0 | 0 | 0.0 | 0.0 | ... | 16.0 | 0.000000 | 0.0 | 0.882772 | 0.973381 | 0.075206 | 0.054807 | 0.000000 | 245.0 | 15.0 |
37788 | 91297 | 0.00000 | tcp | other | REJ | 0.0 | 0.0 | 0 | 0.0 | 0.0 | ... | 0.0 | 0.207923 | 0.0 | 0.724198 | 0.967533 | 0.000000 | 0.939551 | 0.000000 | 269.0 | 0.0 |
37789 | 26750 | 0.00000 | udp | domain_u | SF | 58.0 | NaN | 0 | 0.0 | 0.0 | ... | 303.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.909269 | 0.000000 | 0.009302 | 152.0 | 117.0 |
37790 | 103924 | 90.20205 | tcp | http | SF | 409.0 | 203.0 | 0 | 0.0 | 0.0 | ... | 4.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.978639 | 0.000000 | 0.683759 | 47.0 | 250.0 |
37791 | 20580 | 0.00000 | udp | domain_u | SF | 65.0 | 58.0 | 0 | 0.0 | 0.0 | ... | 148.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.986812 | 0.000000 | 0.019408 | 246.0 | 210.0 |
37792 rows × 34 columns
2. 欠損値を確認する
欠損値の有無、欠損値の数を確認します。
In [6]:
# Count the missing (NaN) values in each column of the training set.
train.isna().sum()
Out[6]:
id 0 duration 1230 protocol_type 0 service 0 flag 0 src_bytes 22003 dst_bytes 21885 land 0 wrong_fragment 337 urgent 84 hot 1953 num_failed_logins 23 logged_in 0 num_compromised 354 root_shell 6 su_attempted 0 num_root 78 num_file_creations 65 num_shells 2 num_access_files 12 num_outbound_cmds 0 is_host_login 0 is_guest_login 0 count 7094 srv_count 1752 serror_rate 799 srv_serror_rate 577 rerror_rate 975 srv_rerror_rate 193 same_srv_rate 1263 diff_srv_rate 472 srv_diff_host_rate 3 dst_host_count 3701 dst_host_srv_count 6281 class 0 dtype: int64
In [7]:
# Count the missing (NaN) values in each column of the test set.
test.isna().sum()
Out[7]:
id 0 duration 505 protocol_type 0 service 0 flag 0 src_bytes 9497 dst_bytes 9421 land 0 wrong_fragment 128 urgent 29 hot 921 num_failed_logins 14 logged_in 0 num_compromised 158 root_shell 4 su_attempted 0 num_root 34 num_file_creations 25 num_shells 0 num_access_files 5 num_outbound_cmds 0 is_host_login 0 is_guest_login 0 count 3141 srv_count 765 serror_rate 330 srv_serror_rate 273 rerror_rate 407 srv_rerror_rate 79 same_srv_rate 539 diff_srv_rate 212 srv_diff_host_rate 3 dst_host_count 1583 dst_host_srv_count 2707 dtype: int64
3. 各変数の分布を確認する
各変数がどのような分布をしているか調べます。また、各変数の組について散布図を描きます。
In [8]:
# Stack train and test row-wise with a fresh 0..n-1 index. The test split has
# no 'class' column, so its rows come out of the concat with a missing label;
# those are tagged 'test' so the split can be used as a hue category in plots.
all_df = (
    pd.concat([train, test], axis=0, ignore_index=True)
    .assign(**{'class': lambda frame: frame['class'].fillna('test')})
)
In [9]:
# Names of the float64 columns plus the 'class' label (kept so it can be used
# as the hue in plots). Index.union returns the names in sorted order.
float_column_names = all_df.select_dtypes(include='float64').columns
numeric_columns = float_column_names.union(['class'])
In [10]:
import matplotlib.pyplot as plt
import seaborn as sns
In [11]:
# Plot the distribution of every float feature, coloured by label
# ('attack' / 'normal' / 'test').
# - 'class' is dropped from the loop: it is the hue variable, not a numeric
#   feature, so a log-scaled histogram of it carries no information.
# - One seeded sample is drawn up front and shared by all plots, so the figures
#   are reproducible across re-runs and comparable with each other.
# - legend expects a boolean; passing the column name only worked by truthiness.
# NOTE(review): a log axis cannot show zero values, and the frame previews
# contain many exact zeros — confirm log_scale=True is intended for every column.
plot_sample = all_df.sample(10000, random_state=0)
for col in numeric_columns.drop('class'):
    sns.displot(data=plot_sample, x=col, hue='class', legend=True, log_scale=True)
    plt.show()
In [12]:
import seaborn as sns
# Pairwise scatter plots of the float features, coloured by label. The sample
# is seeded so that the figure is identical on every Restart & Run All.
sns.pairplot(all_df[numeric_columns].sample(1000, random_state=0), hue='class')
Out[12]:
<seaborn.axisgrid.PairGrid at 0x163845460>
4. 相関係数を確認する
In [13]:
# Pairwise correlation between the float features ('class' is excluded because
# it is a string label), drawn as an annotated heatmap with two-decimal values.
corr = all_df[numeric_columns.drop('class')].corr()
# `linewidths` is the documented heatmap keyword for the cell-separator width;
# the previous `linewidth` spelling only reached matplotlib through **kwargs.
sns.heatmap(corr, linewidths=.2, annot=True, annot_kws={"size": 5}, cmap='YlGnBu', fmt='.2f')
Out[13]:
<Axes: >
In [ ]: