import pandas as pd

# Read both competition splits from the local data directory (train first).
train, test = pd.read_csv('data/train.csv'), pd.read_csv('data/test.csv')

# Print each frame's (rows, columns) tuple next to its caption.
for caption, frame in (('train shape:', train), ('test shape:', test)):
    print(caption, frame.shape)

train shape: (88181, 35)
test shape: (37792, 34)

# Bare expression: as the last statement of a notebook cell this renders the
# training frame as a table (it has no effect when run as a plain script).
train

# Same for the test frame.
test

# Number of missing (NaN) values in each column of the training data.
train.isna().sum()

id                        0
duration               1230
protocol_type             0
service                   0
flag                      0
src_bytes             22003
dst_bytes             21885
land                      0
wrong_fragment          337
urgent                   84
hot                    1953
num_failed_logins        23
logged_in                 0
num_compromised         354
root_shell                6
su_attempted              0
num_root                 78
num_file_creations       65
num_shells                2
num_access_files         12
num_outbound_cmds         0
is_host_login             0
is_guest_login            0
count                  7094
srv_count              1752
serror_rate             799
srv_serror_rate         577
rerror_rate             975
srv_rerror_rate         193
same_srv_rate          1263
diff_srv_rate           472
srv_diff_host_rate        3
dst_host_count         3701
dst_host_srv_count     6281
class                     0
dtype: int64

# Number of missing (NaN) values in each column of the test data.
test.isna().sum()

id                       0
duration               505
protocol_type            0
service                  0
flag                     0
src_bytes             9497
dst_bytes             9421
land                     0
wrong_fragment         128
urgent                  29
hot                    921
num_failed_logins       14
logged_in                0
num_compromised        158
root_shell               4
su_attempted             0
num_root                34
num_file_creations      25
num_shells               0
num_access_files         5
num_outbound_cmds        0
is_host_login            0
is_guest_login           0
count                 3141
srv_count              765
serror_rate            330
srv_serror_rate        273
rerror_rate            407
srv_rerror_rate         79
same_srv_rate          539
diff_srv_rate          212
srv_diff_host_rate       3
dst_host_count        1583
dst_host_srv_count    2707
dtype: int64

# Stack train on top of test under a fresh 0..n-1 index. The test split has no
# 'class' column, so its rows come out of the concat with NaN there; label
# those rows 'test' so the column can be used as a plotting hue.
all_df = pd.concat([train, test], ignore_index=True).fillna({'class': 'test'})

# Every float64 column plus the 'class' label. The label is kept in this index
# on purpose: later cells select all_df[numeric_columns] and colour by 'class'.
is_float_column = all_df.dtypes == float
numeric_columns = all_df.columns[is_float_column].union(['class'])

import matplotlib.pyplot as plt
import seaborn as sns

# Distribution of every float feature, split by the 'class' label.
#
# The sample is drawn once with a fixed seed so that all plots describe the
# same rows and the figures are reproducible between runs. The size is capped
# at the frame length because DataFrame.sample raises when asked for more rows
# than exist (without replacement).
plot_sample = all_df.sample(min(10000, len(all_df)), random_state=0)

# numeric_columns also contains the label column 'class'. It is only needed as
# the hue; it is a string column, so it is not plotted on the log-scaled x axis.
for col in numeric_columns.drop('class'):
    # `legend` is an on/off switch, not a title, so pass a boolean.
    # NOTE(review): log_scale=True applies a log transform to x; many features
    # contain exact zeros, which have no finite log — confirm that dropping
    # them from the plot is intended.
    sns.displot(data=plot_sample, x=col, hue='class', legend=True, log_scale=True)
    plt.show()

import seaborn as sns
# Scatter-matrix over a random 1000-row subset of the float features,
# coloured by the 'class' label.
sns.pairplot(data=all_df.loc[:, numeric_columns].sample(n=1000), hue='class')

<seaborn.axisgrid.PairGrid at 0x163845460>

# Pairwise correlation between the float features (DataFrame.corr defaults to
# Pearson). The 'class' label is dropped first because it is not numeric.
corr = all_df[numeric_columns.drop('class')].corr()

# Annotated heatmap of the matrix. `linewidths` is heatmap's documented keyword
# for the width of the lines separating cells; the small annotation font keeps
# the two-decimal values inside each cell.
sns.heatmap(corr, linewidths=.2, annot=True, annot_kws={"size": 5}, cmap='YlGnBu', fmt='.2f')

<Axes: >

	id	duration	protocol_type	service	flag	src_bytes	dst_bytes	land	wrong_fragment	urgent	...	serror_rate	srv_serror_rate	rerror_rate	srv_rerror_rate	same_srv_rate	diff_srv_rate	srv_diff_host_rate	dst_host_count	dst_host_srv_count	class
0	18965	0.0	tcp	http	SF	319.0	334.0	0	0.0	0.0	...	NaN	0.000000	0.0	0.000000	0.932313	0.000000	0.085849	150.0	248.0	normal
1	75483	0.0	udp	private	SF	NaN	19.0	0	0.0	0.0	...	0.000000	0.000000	0.0	0.000000	0.140020	0.186396	0.000000	287.0	NaN	attack
2	112534	0.0	udp	domain_u	SF	31.0	87.0	0	0.0	0.0	...	0.000000	0.000000	0.0	0.000000	0.947003	0.000000	0.000000	244.0	250.0	normal
3	5811	0.0	tcp	Z39_50	S0	0.0	0.0	0	0.0	0.0	...	0.904851	1.036665	0.0	0.000000	0.074540	0.050226	0.000000	282.0	18.0	attack
4	33469	NaN	tcp	private	S0	0.0	0.0	0	0.0	0.0	...	0.955840	0.931652	0.0	0.000000	0.028668	0.059607	0.000000	264.0	6.0	attack
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
88176	93942	0.0	tcp	private	S0	NaN	0.0	0	0.0	0.0	...	0.938058	0.957549	0.0	0.000000	0.059318	0.054229	0.000000	233.0	13.0	attack
88177	22377	0.0	tcp	http	SF	281.0	NaN	0	0.0	0.0	...	0.000000	0.000000	0.0	0.000000	0.905550	0.000000	0.000000	24.0	253.0	normal
88178	43498	0.0	tcp	ftp_data	SF	3345.0	0.0	0	0.0	0.0	...	0.000000	0.000000	0.0	0.000000	0.947099	0.000000	0.072131	250.0	NaN	normal
88179	103546	0.0	tcp	http	SF	217.0	295.0	0	0.0	0.0	...	0.000000	0.000000	0.0	0.107418	0.944223	0.000000	0.420514	227.0	240.0	normal
88180	77217	0.0	udp	domain_u	SF	48.0	NaN	0	0.0	0.0	...	0.000000	0.000000	0.0	0.000000	0.895179	0.000000	0.242852	227.0	260.0	normal

	id	duration	protocol_type	service	flag	src_bytes	dst_bytes	land	wrong_fragment	urgent	...	srv_count	serror_rate	srv_serror_rate	rerror_rate	srv_rerror_rate	same_srv_rate	diff_srv_rate	srv_diff_host_rate	dst_host_count	dst_host_srv_count
0	8217	0.00000	udp	domain_u	SF	30.0	NaN	0	0.0	0.0	...	183.0	0.000000	0.0	NaN	0.000000	1.040264	0.000000	0.009590	254.0	230.0
1	71759	0.00000	tcp	http	SF	286.0	19905.0	0	0.0	0.0	...	11.0	0.000000	0.0	0.000000	0.000000	0.898254	0.000000	0.239601	283.0	266.0
2	41181	0.00000	tcp	http	SF	379.0	1984.0	0	0.0	0.0	...	24.0	0.000000	0.0	0.000000	0.000000	0.956086	0.000000	0.067805	254.0	NaN
3	34327	0.00000	tcp	http	SF	309.0	NaN	0	0.0	0.0	...	2.0	0.000000	0.0	0.000000	0.000000	0.983514	0.000000	0.000000	32.0	253.0
4	110640	0.00000	tcp	http	SF	181.0	1507.0	0	0.0	0.0	...	5.0	0.000000	0.0	0.000000	0.000000	0.942385	0.000000	0.000000	255.0	244.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
37787	121411	0.00000	tcp	private	REJ	0.0	0.0	0	0.0	0.0	...	16.0	0.000000	0.0	0.882772	0.973381	0.075206	0.054807	0.000000	245.0	15.0
37788	91297	0.00000	tcp	other	REJ	0.0	0.0	0	0.0	0.0	...	0.0	0.207923	0.0	0.724198	0.967533	0.000000	0.939551	0.000000	269.0	0.0
37789	26750	0.00000	udp	domain_u	SF	58.0	NaN	0	0.0	0.0	...	303.0	0.000000	0.0	0.000000	0.000000	0.909269	0.000000	0.009302	152.0	117.0
37790	103924	90.20205	tcp	http	SF	409.0	203.0	0	0.0	0.0	...	4.0	0.000000	0.0	0.000000	0.000000	0.978639	0.000000	0.683759	47.0	250.0
37791	20580	0.00000	udp	domain_u	SF	65.0	58.0	0	0.0	0.0	...	148.0	0.000000	0.0	0.000000	0.000000	0.986812	0.000000	0.019408	246.0	210.0

EDA for traP コンペ 2024 # 00¶

前提¶

1. データを確認する¶

2. 欠損値を確認する¶

3. 各変数の分布を確認する¶

4. 相関係数の確認¶

EDA for traP コンペ 2024 # 00¶

前提¶

1. データを確認する¶

2. 欠損値を確認する¶

3. 各変数の分布を確認する¶

4. 相関係数の確認¶

1. データを確認する¶