1 What is EDA?

  1. 제안자: John Tukey
  2. 본격적인 데이터 분석에 앞서 데이터에 있는 중요한 특성을 요약하고 시각화하는 분석 방법
  3. 즉, 데이터의 모든 변수들에 대하여
    • 관측치들의 패턴 탐색
    • 잘못된 자료들을 탐색
    • 연구 가설의 설정
    • 연구의 가정을 검토
    • 변수들간의 관계 파악

2 EDA의 유형 분류

3 Bank Marketing Data Set

4 EDA for Bank Marketing Data Set

# Load the semicolon-delimited bank marketing data; the first row holds the
# column names. TRUE is spelled out because T is an ordinary variable that
# can be reassigned.
bank <- read.csv("bank.csv", sep = ";", header = TRUE)
# List the column names of the loaded data frame.
names(bank)
##  [1] "age"       "job"       "marital"   "education" "default"  
##  [6] "balance"   "housing"   "loan"      "contact"   "day"      
## [11] "month"     "duration"  "campaign"  "pdays"     "previous" 
## [16] "poutcome"  "y"
# Preview the first six rows of the data frame.
head(bank, n = 6L)
##   age         job marital education default balance housing loan  contact
## 1  30  unemployed married   primary      no    1787      no   no cellular
## 2  33    services married secondary      no    4789     yes  yes cellular
## 3  35  management  single  tertiary      no    1350     yes   no cellular
## 4  30  management married  tertiary      no    1476     yes  yes  unknown
## 5  59 blue-collar married secondary      no       0     yes   no  unknown
## 6  35  management  single  tertiary      no     747      no   no cellular
##   day month duration campaign pdays previous poutcome  y
## 1  19   oct       79        1    -1        0  unknown no
## 2  11   may      220        1   339        4  failure no
## 3  16   apr      185        1   330        1  failure no
## 4   3   jun      199        4    -1        0  unknown no
## 5   5   may      226        1    -1        0  unknown no
## 6  23   feb      141        2   176        3  failure no
# Number of rows, then number of columns.
c(nrow(bank), ncol(bank))
## [1] 4521   17

4.1 고객정보

  1. 나이(수치형)

    # Quartile summary and mean of the age column.
    summary(bank[["age"]])
    ##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    ##   19.00   33.00   39.00   41.17   49.00   87.00
    # Box plot of age using base graphics.
    boxplot(bank$age)
    library(ggplot2)

    # The same box plot with ggplot2; the empty string on x puts every
    # observation into a single unlabeled group.
    ggplot(bank, mapping = aes(y = age, x = "")) + geom_boxplot()
  2. 직업

    # Frequency of each job category.
    table(bank[["job"]])
    ## 
    ##        admin.   blue-collar  entrepreneur     housemaid    management 
    ##           478           946           168           112           969 
    ##       retired self-employed      services       student    technician 
    ##           230           183           417            84           768 
    ##    unemployed       unknown 
    ##           128            38
    # Widen the bottom margin so the rotated (las = 2) job labels fit, then
    # restore the previous graphics settings so later base plots are unaffected.
    old_par <- par(mar = c(8, 3, 1, 1))
    barplot(table(bank$job), las = 2)
    par(old_par)

    # Same counts as a horizontal ggplot2 bar chart; as.data.frame() on a
    # one-way table yields the columns Var1 (category) and Freq (count).
    dt <- as.data.frame(table(bank$job))
    ggplot(data = dt, aes(x = Var1, y = Freq)) +
        geom_bar(stat = "identity") + coord_flip() # flip coordinates
  3. 결혼상태

    # One-way counts of marital status, printed with the grand total appended.
    p <- table(bank[["marital"]])
    addmargins(p, margin = 1)
    ## 
    ## divorced  married   single      Sum 
    ##      528     2797     1196     4521
    # Cross-tabulate job (rows) by marital status (columns) and render it with
    # row and column totals. kable() is namespace-qualified so the call works
    # even when knitr has not been attached with library().
    p2 <- xtabs(~ job + marital, data = bank)
    knitr::kable(addmargins(p2))
    divorced married single Sum
    admin. 69 266 143 478
    blue-collar 79 693 174 946
    entrepreneur 16 132 20 168
    housemaid 13 84 15 112
    management 119 557 293 969
    retired 43 176 11 230
    self-employed 15 127 41 183
    services 62 236 119 417
    student 0 10 74 84
    technician 89 411 268 768
    unemployed 22 75 31 128
    unknown 1 30 7 38
    Sum 528 2797 1196 4521
    # Row percentages: share of each marital status within a job, with the row
    # total appended as the last column. kable() is namespace-qualified so the
    # call works even when knitr has not been attached with library().
    knitr::kable(
        addmargins(prop.table(p2, margin = 1) * 100, margin = 2),
        digits = 2
    )
    divorced married single Sum
    admin. 14.44 55.65 29.92 100
    blue-collar 8.35 73.26 18.39 100
    entrepreneur 9.52 78.57 11.90 100
    housemaid 11.61 75.00 13.39 100
    management 12.28 57.48 30.24 100
    retired 18.70 76.52 4.78 100
    self-employed 8.20 69.40 22.40 100
    services 14.87 56.59 28.54 100
    student 0.00 11.90 88.10 100
    technician 11.59 53.52 34.90 100
    unemployed 17.19 58.59 24.22 100
    unknown 2.63 78.95 18.42 100
    # Side-by-side bars of marital status within each job, drawn horizontally.
    ggplot(bank, aes(x = job, fill = marital)) +
          geom_bar(width = 0.7, position = "dodge") +
          coord_flip() # flip coordinates
  4. 잔고(balance): 연평균 잔고, 유로 단위 (수치형)

    # Quartile summary and mean of the balance column.
    summary(bank[["balance"]])
    ##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    ##   -3313      69     444    1423    1480   71188
    # Scatter plot of balance against age.
    ggplot(data = bank, mapping = aes(x = age, y = balance)) +
          geom_point()

    # Distribution of balance within each education level.
    ggplot(data = bank, mapping = aes(x = education, y = balance)) +
          geom_boxplot()

    # Mean balance for every education x marital combination, shown as
    # grouped bars.
    ss <- aggregate(balance ~ education + marital, data = bank, FUN = mean)
    ggplot(ss, aes(x = marital, y = balance, fill = education)) +
          geom_col(width = 0.7, position = "dodge")


5 과제 2