Code
import numpy as np
import pandas as pd
import zipfile
# Load the Titanic train/test CSVs from the zip archive saved in the day-6 folder.
# The context managers close the archive and its member handles once both frames
# have been read; `zf` stays bound afterwards but refers to a closed archive.
with zipfile.ZipFile("../2022-10-12-day-6-of-50daysofkaggle/titanic.zip") as zf:
    with zf.open("train.csv") as train_file:
        train = pd.read_csv(train_file)
    with zf.open("test.csv") as test_file:
        test = pd.read_csv(test_file)

# Names of the numeric columns in the training data.
num_col = train.select_dtypes(include=np.number).columns.tolist()
# Drop the first two numeric columns by position with a slice delete
# (list.remove() only removes a single item per call).
# Assumes those two are the passenger ID and 'Survived' -- confirm against train.csv.
del num_col[0:2]
# NOTE: select_col is the same list object as num_col, not a copy, so the
# extend() below also adds the non-numeric names to num_col.
select_col = num_col
# Remaining columns to keep alongside the numeric ones.
str_col = ["Sex", "Embarked", "Survived"]
# extend() adds each element of str_col; append() would add the list itself as one item.
select_col.extend(str_col)
# Keep only the selected columns of the training frame for EDA.
train_eda = train[train.columns.intersection(select_col)]
train_eda

| | Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked |
|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S |
| 1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C |
| 2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S |
| 3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S |
| 4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 0 | 2 | male | 27.0 | 0 | 0 | 13.0000 | S |
| 887 | 1 | 1 | female | 19.0 | 0 | 0 | 30.0000 | S |
| 888 | 0 | 3 | female | NaN | 1 | 2 | 23.4500 | S |
| 889 | 1 | 1 | male | 26.0 | 0 | 0 | 30.0000 | C |
| 890 | 0 | 3 | male | 32.0 | 0 | 0 | 7.7500 | Q |
891 rows × 8 columns