# WineQuality 분석

# -------------------------------------------------------------------------------- #
#
#                           ---- WineQuality ----
#
# -------------------------------------------------------------------------------- #

# 기간 : 2019-7-19 ~ 2019-7-26
# 참가인원 : 김석준, 김영인, 오창민, 이현영
# 김석준 : 데이터 전처리, 군집분석, SOM, 주성분분석, 인공신경망
# 김영인 : 의사결정나무
# 오창민 : 나이브 베이즈 분류기
# 이현영 : SVM

# install.packages("tidyverse")
# install.packages("rJava")
# install.packages("DBI")
# install.packages("RMySQL")
# install.packages("rpart")
# install.packages("e1071")
# install.packages("NeuralNetTools")
# install.packages("neuralnet")
# install.packages("caret")
# install.packages("kohonen")
# install.packages("klaR")
library(kohonen);library(nnet);library(caret)
library(NeuralNetTools);library(e1071);library(rpart)
library(klaR);library(ggplot2);library(dplyr);library(readr)

# Read the two semicolon-delimited wine-quality CSV files.
# readr::read_delim() plays a role similar to base read.table().
red_path <- "H:\\R데이터분석\\데이터\\와인\\winequality-red.csv"
white_path <- "H:\\R데이터분석\\데이터\\와인\\winequality-white.csv"

redwine <- read_delim(red_path, delim = ";") # red wine
whitewine <- read_delim(white_path, delim = ";") # white wine

# -------------------------------------------------------------------------------- #
#
#                         ---- 1. 데이터 탐색(수치) ----
#
# -------------------------------------------------------------------------------- #
str(redwine) # 레드와인
str(whitewine) # 화이트와인
# ---------------- #
# fixed acidity - 고정산도 -> fixed
# volatile acidity - 휘발성 산도 -> volatile
# citric acid - 구연산 -> citric
# residual sugar - 잔류 설탕 -> sugar
# chlorides - 염화물 -> chlorides
# free sulfur dioxide - 자유 이산화황 -> fsd
# total sulfur dioxide - 총 이산화황 -> tsd
# density - 밀도 -> density
# pH - 산도 -> ph
# sulphates - 황산염 -> sulphates
# alcohol - 알코올 -> alcohol
# Output variable (based on sensory data):
# quality (score between 0 and 10) - 와인의 품질(종속변수) -> quality
# ---------------- #
# 변수명의 변경
# quality(와인품질)은 사람이 매긴 점수이므로 factor로 변환한다.
# quality는 0에서 10까지의 값을 가진다.
# 결측값은 존재하지 않는다.

# -- Rename columns -- #
# Short column names, applied to both data sets in file order.
name <- c(
  "fixed", "volatile", "citric", "sugar", "chlorides", "fsd", "tsd",
  "density", "ph", "sulphates", "alcohol", "quality"
)
redwine <- setNames(redwine, name)
whitewine <- setNames(whitewine, name)

# -- Convert quality to a factor -- #
redwine <- mutate(redwine, quality = factor(quality))
whitewine <- mutate(whitewine, quality = factor(quality))

# -- Summary statistics -- #
summary(redwine)
summary(whitewine)
# The two data frames are merged and analysed together.
# Goals: find which attributes earn high scores for white and for red wine,
# build a model that predicts the score, and predict the wine type from the
# score and the attributes.

# -- Merge the data -- #
# Tag each data set with its wine type, then stack them.
red_tagged <- mutate(redwine, type = "red")
white_tagged <- mutate(whitewine, type = "white")
wine <- rbind(red_tagged, white_tagged)
wine$type <- factor(wine$type) # type becomes a factor
name <- append(name, "type")

# -- Summary statistics (wine) -- #
str(wine)
summary(wine)

# Write the combined data set to the database as table "wine", overwriting any
# existing table of that name.
# NOTE(review): `con` is never created in the visible script, and no database
# package is attached by the library() calls at the top -- presumably a
# DBI/RMySQL connection was opened interactively; confirm before running.
dbWriteTable(con, name = "wine", value = wine, overwrite = TRUE, temporary = FALSE)

rm(list = setdiff(ls(), c("con"))) # remove every object except the connection
# -------------------------------------------------------------------------------- #
#
#                         ---- 1. 데이터 탐색(그림) ----
#
# -------------------------------------------------------------------------------- #
# List the tables, then load the "wine" table back from the database.
dbListTables(con)
wine <- tbl(con, "wine") %>%
  collect() %>% # pull the rows into R
  mutate(
    quality = factor(quality),
    type = factor(type)
  ) %>%
  select(-row_names) # drop the row_names column (presumably added on write)

# -------- Comparison with wine quality --------
# For each of the 11 predictors, four plots are drawn in this order:
#   1. density histogram over all wines
#   2. the same histogram, faceted by wine type
#   3. box plot of the predictor by quality
#   4. box plot of the predictor by quality, faceted by wine type
# The original script repeated the same ggplot call 44 times; the plots are
# now generated by two helpers and a loop. Plot titles are kept in Korean.
#
# Author's observations from the plots:
#   fixed     (vs type)          white wine's acidity is mostly lower than red's.
#   volatile  (vs type)          white wine's acidity is mostly lower than red's.
#   citric    (vs type)          white wine is more concentrated around the centre.
#   sugar     (vs type)          red wine has less residual sugar than white.
#   chlorides (vs type)          red wine has more chlorides than white.
#   fsd       (vs type)          little difference between red and white.
#   tsd       (vs type)          white wine mostly has more sulfur dioxide than red.
#   density   (vs type)          red wine is shifted further right than white.
#   density   (vs quality)       lower density may indicate better quality.
#   ph        (overall)          the distribution looks approximately normal.
#   ph        (vs type)          white wine's distribution looks normal.
#   sulphates (vs quality, type) in red wine, high sulphates may indicate good quality.
#   alcohol   (vs quality)       higher alcohol is expected to mean higher quality.
#   alcohol   (vs quality, type) the same is expected for both red and white.

# Column name -> Korean label used in the plot titles.
explore_vars <- c(
  fixed = "고정산도",
  volatile = "휘발성 산도",
  citric = "구연산",
  sugar = "잔류설탕",
  chlorides = "염화물",
  fsd = "자유 이산화황",
  tsd = "총 이산화황",
  density = "밀도",
  ph = "산도",
  sulphates = "황산염",
  alcohol = "알코올"
)

# Centred title styling shared by every exploratory plot.
explore_title_theme <- theme(
  plot.title = element_text(hjust = 0.5, vjust = 1, size = 20, colour = "#112211")
)

# Density histogram of column `var` with a density line on top.
# `by_type = TRUE` adds one facet row per wine type.
plot_density_hist <- function(data, var, title, by_type = FALSE) {
  p <- ggplot(data = data, aes(x = .data[[var]], y = ..density..)) +
    geom_histogram(bins = 20, colour = "#ffffff", fill = "#5daa5d") +
    geom_line(stat = "density", colour = "blue") +
    labs(x = var) + # keep the column name as the axis label
    ggtitle(title) +
    explore_title_theme
  if (by_type) {
    p <- p + facet_grid(type ~ .)
  }
  p
}

# Box plot of column `var` per quality score (outliers drawn in red).
# `by_type = TRUE` adds one facet row per wine type.
plot_quality_box <- function(data, var, title, by_type = FALSE) {
  p <- ggplot(data = data, aes(x = quality, y = .data[[var]], fill = quality)) +
    geom_boxplot(outlier.colour = "red") +
    labs(y = var) + # keep the column name as the axis label
    ggtitle(title) +
    explore_title_theme
  if (by_type) {
    p <- p + facet_grid(type ~ .)
  }
  p
}

# ---- 1) - 11) one group of four plots per predictor ----
# print() is required because ggplot objects are not auto-printed in a loop.
for (v in names(explore_vars)) {
  label <- explore_vars[[v]]
  print(plot_density_hist(wine, v, label))
  print(plot_density_hist(wine, v, paste(label, "vs type"), by_type = TRUE))
  print(plot_quality_box(wine, v, paste(label, "vs quality")))
  # The faceted box plot previously reused the "vs quality" title.
  print(plot_quality_box(wine, v, paste(label, "vs quality, type"), by_type = TRUE))
}
# red wine과 white wine에서도 같이 예측된다.

# ---- 12) Correlation between the independent variables ----
# Scatter-plot matrix and Pearson correlations of every numeric predictor.
predictors <- select(wine, -c(type, quality))
pairs(predictors)
corr <- cor(predictors, method = "pearson")
abs(corr) >= 0.5
# Pairs the author flagged from this table:
# density-sugar
# fsd-tsd
# density-alcohol
# A new variable sd = fsd/tsd is created and used instead.
# density is excluded from the analysis.

# -------------------------------------------------------------------------------- #
#
#                         ---- 2. 데이터 전처리 ----
#
# -------------------------------------------------------------------------------- #

# -------- 1) Distribution of each variable after normalisation --------
# Derive the three-level grade and the sulfur-dioxide ratio, then drop the
# columns they replace.
#   quality <= 4 -> "하급" (low), 5-7 -> "중급" (middle), >= 8 -> "상급" (high)
# cut() is used instead of nested ifelse() so that a score of 0 (valid on the
# 0-10 scale) is graded low rather than falling through to the top grade, and
# so that a missing score stays NA.
wine <- wine %>%
  mutate(
    quality_g = cut(
      as.integer(as.character(quality)), # factor labels -> numeric scores
      breaks = c(-Inf, 4, 7, Inf),
      labels = c("하급", "중급", "상급")
    ),
    sd = fsd / tsd # share of free sulfur dioxide in total sulfur dioxide
  ) %>%
  select(-c(fsd, tsd, density))

# -- Check the correlation coefficients -- #
wineCor <- cor(wine %>% select(-c(type, quality, quality_g)))
abs(wineCor) >= 0.4
pairs(wine %>% select(-c(type, quality, quality_g)))
# Author's conclusion from the scatter-plot matrix and the correlations:
# the independent variables can be assumed to be mutually independent.

# Split into training data (70%, stratified by type and grade) and test data.
# NOTE(review): anti_join() without `by` matches on every column, so any row of
# `wine` that has an identical twin in `train` is also dropped from `test`;
# confirm whether losing duplicated measurements from the test set is intended.
set.seed(20190724)
train <- wine %>%
  group_by(type, quality_g) %>%
  sample_frac(0.7) %>%
  ungroup()
test <- wine %>% anti_join(train)

# Normalisation: z-score every numeric predictor within each wine type.
# scale() returns a one-column matrix carrying attributes; as.numeric() turns
# it back into a plain vector so the result has ordinary numeric columns.
z_score <- function(x) {
  as.numeric(scale(x))
}

train2 <- train %>%
  group_by(type) %>%
  mutate(
    fixed = z_score(fixed),
    volatile = z_score(volatile),
    citric = z_score(citric),
    sugar = z_score(sugar),
    chlorides = z_score(chlorides),
    sd = z_score(sd),
    ph = z_score(ph),
    sulphates = z_score(sulphates),
    alcohol = z_score(alcohol)
  ) %>%
  ungroup()

# NOTE(review): the test set is standardised with its own per-type mean and
# standard deviation rather than the training set's; confirm this is intended.
test2 <- test %>%
  group_by(type) %>%
  mutate(
    fixed = z_score(fixed),
    volatile = z_score(volatile),
    citric = z_score(citric),
    sugar = z_score(sugar),
    chlorides = z_score(chlorides),
    sd = z_score(sd),
    ph = z_score(ph),
    sulphates = z_score(sulphates),
    alcohol = z_score(alcohol)
  ) %>%
  ungroup()

# Plots of the standardised training data. Every layer is added to `base2`.
base2 <- ggplot(data = train2)

# Centred title styling shared by every plot in this section.
norm_title_theme <- theme(plot.title = element_text(hjust = 0.5, vjust = 1, size = 20))

# ---- quality ----
base2 +
  geom_bar(aes(x = quality), stat = "count", fill = "green", col = "black") +
  ggtitle("품질") +
  norm_title_theme

base2 +
  geom_bar(aes(x = quality), stat = "count", fill = "green", col = "black") +
  ggtitle("type별 품질") +
  norm_title_theme +
  facet_grid(type ~ .)
# Quality scores 3, 4, 8 and 9 have few observations,
# which is why quality is grouped into three categories.

# Column name -> Korean label used in the plot titles.
norm_vars <- c(
  fixed = "고정산도",
  volatile = "휘발성 산도",
  citric = "구연산",
  sugar = "잔류 설탕",
  chlorides = "염화물",
  sd = "이산화황",
  ph = "산도",
  sulphates = "황산염",
  alcohol = "알콜"
)

# Density histogram of standardised column `v` with a red density line.
# `by_type = TRUE` adds one facet row per wine type.
plot_norm_hist <- function(v, title, by_type = FALSE) {
  p <- base2 +
    geom_histogram(aes(x = .data[[v]], y = ..density..), bins = 30,
                   fill = "green", col = "black") +
    geom_line(aes(x = .data[[v]]), stat = "density", col = "red", size = 1.5) +
    labs(x = v) + # keep the column name as the axis label
    ggtitle(title) +
    norm_title_theme
  if (by_type) {
    p <- p + facet_grid(type ~ .)
  }
  p
}

# Box plot of standardised column `v` per quality grade.
plot_norm_box <- function(v, title) {
  base2 +
    geom_boxplot(aes(x = quality_g, y = .data[[v]], fill = quality_g)) +
    labs(y = v) + # keep the column name as the axis label
    ggtitle(title) +
    norm_title_theme
}

# Three plots per predictor: overall, by type, by quality grade.
# print() is required because ggplot objects are not auto-printed in a loop.
# The by-type plot for `sd` was previously titled "자유 이산화황" (free sulfur
# dioxide), unlike its two sibling plots; all three now share one label.
for (v in names(norm_vars)) {
  label <- norm_vars[[v]]
  print(plot_norm_hist(v, paste(label, "정규화")))
  print(plot_norm_hist(v, paste("type별", label, "정규화"), by_type = TRUE))
  print(plot_norm_box(v, paste("품질별", label, "정규화")))
}

# -------- 2) Cluster analysis --------
# Clustering is used to check which observations are outliers.
rm(list = setdiff(ls(), c("con","wine","train2","test2","train", "test")))
# The cluster analysis uses the train2 data.
# NOTE(review): the original note listed fsd, tsd and density as clustering
# inputs, but those columns were removed from `wine` earlier in this script
# (replaced by sd); the calls below use every train2 column except type,
# quality and quality_g.

# -- k-means cluster analysis --
# NOTE(review): the original note planned a minimum of 3 clusters (the three
# grades) and a maximum of 11 (the number of quality scores), but the sweep
# below actually tries 2 to 40 clusters.
# Run k-means with `x` centres on `data` and return the share of the total sum
# of squares that lies within clusters ("g_in") and between clusters ("g_out").
k_mean <- function(x, data) {
  fit <- kmeans(data, centers = x, iter.max = 500)
  c(
    "g_in" = fit$tot.withinss / fit$totss,
    "g_out" = fit$betweenss / fit$totss
  )
}
# Sweep k = 2..40 and record the within/between sum-of-squares shares.
set.seed(20190724)
k_values <- 2:40
cluster_input <- train2 %>% select(-c(type, quality, quality_g))
# vapply() pins the result shape: a 2 x length(k_values) numeric matrix.
result <- vapply(k_values, k_mean,
                 FUN.VALUE = c(g_in = 0, g_out = 0),
                 data = cluster_input)
result <- as.data.frame(t(result))
result$k <- k_values # keep k in the data instead of an external vector in aes()

# Plot of the within-cluster distance
ggplot(result, aes(x = k, y = g_in)) +
  geom_point(colour = "blue", size = 5) +
  geom_line(colour = "red") +
  scale_x_continuous(breaks = k_values) +
  ggtitle("군집내 거리") +
  theme(plot.title = element_text(hjust = 0.5, vjust = 1,
                                  size = 25, colour = "red"))

# Plot of the between-cluster distance
ggplot(result, aes(x = k, y = g_out)) +
  geom_point(colour = "blue", size = 5) +
  geom_line(colour = "red") +
  scale_x_continuous(breaks = k_values) +
  ggtitle("군집간 거리") +
  theme(plot.title = element_text(hjust = 0.5, vjust = 1,
                                  size = 25, colour = "red"))

# Author's reading of the plots: 35 clusters looks appropriate.
# Seeded so that the cluster table below is reproducible (kmeans picks random
# starting centres; the original call was unseeded).
set.seed(20190724)
clust <- kmeans(cluster_input, centers = 35, iter.max = 200)
clust
with(clust, c(tot.withinss / totss, betweenss / totss))

# Idea: if several quality grades are mixed inside one cluster, treat those
# observations as outliers and remove them.
train2 %>%
  group_by(clust$cluster) %>%
  summarise("하급" = sum(quality_g == "하급"),
            "중급" = sum(quality_g == "중급"),
            "상급" = sum(quality_g == "상급")) %>%
  as.data.frame()
# Author's conclusion: the middle grade dominates every cluster, so the
# clustering is judged meaningless and is not used.

# ---- Self-organising map (SOM) ----
# Train a SOM on the train2 predictors (type, quality and quality_g excluded).
som_input <- as.matrix(select(train2, -c(type, quality, quality_g)))
som_model <- som(som_input)
plot(som_model)

names(som_model)
som_model$unit.classif

# Count the quality grades per value of unit.classif.
train2 %>%
  group_by(som_model$unit.classif) %>%
  summarise(
    "하급" = sum(quality_g == "하급"),
    "중급" = sum(quality_g == "중급"),
    "상급" = sum(quality_g == "상급")
  ) %>%
  as.data.frame()
# Author's conclusion: as with k-means, the middle grade dominates every
# group, so the SOM is not used either.

# ---- Principal component analysis ----
# `scale. = TRUE` is spelled out in full; the original `scale =` only worked
# through partial argument matching.
# NOTE(review): the PCA is fitted on all of `wine`, i.e. before the train/test
# split below; confirm this is intended.
prc <- prcomp(wine %>% select(-c(type, quality, quality_g)),
              scale. = TRUE, retx = TRUE)
summary(prc)
# "Proportion of Variance" is the share of variance explained; components are
# kept up to a cumulative value of about 0.7-0.9.
# Author's choice: use PC1 to PC6.

prc$x # principal component scores: used as the new variables
prc$rotation
# Principal component loadings: their absolute values are used to decide what
# each new variable represents when naming it.

# as_tibble() replaces the deprecated tbl_df().
newdata <- cbind(wine, prc$x) %>%
  as_tibble() %>%
  select(type, quality_g, quality, PC1, PC2, PC3, PC4, PC5, PC6)

# Same stratified 70/30 split as for the raw data.
set.seed(20190724)
newTrain <- newdata %>%
  group_by(type, quality_g) %>%
  sample_frac(0.7) %>%
  ungroup()
newTest <- anti_join(newdata, newTrain)
# Author's note: not used, because the variables are assumed to be independent.

# -------------------------------------------------------------------------------- #
#
#                         ---- 2. 데이터 분석(quality 예측) ----
#
# -------------------------------------------------------------------------------- #
# Remove every workspace object except the ones named in the keep list.
# NOTE(review): "select" is on the keep list, but no object of that name is
# assigned before this point in the visible script (one is assigned further
# down, in the naive Bayes section).
rm(list = setdiff(ls(),c("con","wine","train2","test2",
                         "train", "test","newdata","newTrain",
                         "newTest","select") ))

# ---- 1) MLP (train2: predicting quality) ----

# Author's notes: a multilayer perceptron needs normalised inputs to give
# accurate values, hence train2. quality is predicted first and then converted
# to quality_g to check the prediction rate.

# Fit a neural network with `x` hidden units that predicts quality from every
# column of `data` except quality_g.
net <- function(x, data) {
  fit <- nnet(quality ~ . - quality_g, data = data, size = x)
  fit
}

# Disabled variant: fit on the normalised data
# set.seed(123456)
# model <- lapply(2:30,FUN = net, data=train2)
# names(model) <- 2:30

# Fit on the principal-component data, one network per hidden-layer size.
hidden_sizes <- 2:30
set.seed(123456)
model2 <- lapply(hidden_sizes, net, data = newTrain)
model2 <- setNames(model2, hidden_sizes)

# Convert quality scores to the three-level grade quality_g.
#   x: quality scores as a factor, character or numeric vector.
#   Returns a factor with levels "하급" (score <= 4), "중급" (5-7), "상급" (>= 8).
# cut() replaces the nested ifelse() so that a score of 0 is graded low instead
# of falling through to the top grade; scores that are NA stay NA.
lv <- function(x) {
  score <- as.integer(as.character(x)) # factor labels -> numeric scores
  cut(score, breaks = c(-Inf, 4, 7, Inf), labels = c("하급", "중급", "상급"))
}

# Disabled variant for the normalised data:
# pred <- lapply(model, FUN = function(x){
#   predict(x, test2, type = "class") %>% lv
# })

# Predict quality on the PCA test set with every network, then grade it.
pred2 <- lapply(model2, function(fit) {
  lv(predict(fit, newTest, type = "class"))
})

# kappa <- sapply(pred, function(x){
#   confusionMatrix(x, test2$quality_g)$overall["Kappa"]
# })
# One kappa per hidden-layer size; names are the sizes ("2" ... "30").
kappa2 <- vapply(pred2, function(p) {
  confusionMatrix(p, newTest$quality_g)$overall[["Kappa"]]
}, numeric(1))

# plot(x = 2:30, y = kappa, type= "l")
# which.max(kappa) # author's run: kappa was largest at size = 28
plot(x = as.integer(names(kappa2)), y = kappa2, type = "l")
which.max(kappa2) # author's run: kappa was largest at size = 14

# result <- lapply(pred, function(x){
#   confusionMatrix(x, test2$quality_g)
# })
result2 <- lapply(pred2, function(p) {
  confusionMatrix(p, newTest$quality_g)
})

# Select the size with the highest kappa instead of a hard-coded "28", which
# matched neither the size noted above (14) nor the old heading (29).
best_size <- names(which.max(kappa2))

# result$'28' # author's note: better than the PCA-based result
result2[[best_size]]
# Author's conclusion: with the existing variables there is too little data in
# the low and high grades for a proper analysis, so new variables or more data
# would probably be needed.

# -- Keep the model with the best kappa --
model <- model2[[best_size]]

summary(model) # model summary

plotnet(model) # visualisation

# ------ SVM -------
# For modelling, only the target is treated as a factor; every other variable
# is assumed to be continuous or an ordered categorical variable.

# Tune the hyper-parameters per kernel. tune.svm() cross-validates on random
# folds, so a seed is set to make the selected parameters reproducible.
set.seed(20190724)

# 1. Radial kernel
result <- tune.svm(quality ~ . - quality_g, data = train2,
                   gamma = 2^(-5:0), cost = 2^(0:4), kernel = "radial")

# 2. Support vector classifier (linear)
result1 <- tune.svm(quality ~ . - quality_g, data = train2,
                    cost = 2^(0:4), kernel = "linear")

# 3. Polynomial kernel (curved boundary)
# The kernel name is spelled out; "polynomia" only worked by partial matching.
result2 <- tune.svm(quality ~ . - quality_g, data = train2,
                    cost = 2^(0:4), degree = 2:4, kernel = "polynomial")
# cost, gamma and degree are given as ranges; a model is built for each
# combination in the range.

# Check the selected cost / gamma / degree values
result$best.parameters # author's run: gamma = 1, cost = 4
result1$best.parameters # author's run: cost = 1
result2$best.parameters # author's run: degree = 4, cost = 8

# -- Results recorded for "train1" --
# > result$best.parameters
# gamma cost
# 12     1    2
# > result1$best.parameters
# cost
# 1    1
# > result2$best.parameters
# degree cost
# 2      3    1

# Fit the radial-kernel SVM with explicit gamma and cost.
wine_svm <- svm(quality ~ . - quality_g, data = train2,
                gamma = 1, cost = 4, kernel = "radial")

# Linear kernel (support vector classifier) with explicit cost.
# NOTE(review): this model and the polynomial one are fitted on the raw `train`
# data although the parameters were tuned on train2; confirm which is intended.
wine_svm1 <- svm(quality ~ . - quality_g, data = train,
                 cost = 1, kernel = "linear")

# Polynomial kernel with explicit cost and degree.
# The kernel name is spelled out; "polynomia" only worked by partial matching.
wine_svm2 <- svm(quality ~ . - quality_g, data = train,
                 cost = 8, degree = 4, kernel = "polynomial")

# Inspect the fits
summary(wine_svm)
summary(wine_svm1)
summary(wine_svm2)

# Support vectors of each model (indices of the observations used)
wine_svm$index
wine_svm1$index
wine_svm2$index

# Predict on held-out data and convert the predicted quality to a grade.
# The radial model was fitted on the standardised train2, so it must be scored
# on the standardised test2 (it was previously scored on the raw `test`).
wine_svm_predict <- predict(wine_svm, test2) %>% lv
wine_svm1_predict1 <- predict(wine_svm1, test) %>% lv
wine_svm2_predict2 <- predict(wine_svm2, test) %>% lv

# Confusion matrices
confusionMatrix(wine_svm_predict, test2$quality_g)
confusionMatrix(wine_svm1_predict1, test$quality_g)
confusionMatrix(wine_svm2_predict2, test$quality_g) # author's run: highest kappa

# ---- Decision tree ----
# Grow a classification tree for quality on the PCA training data.
# cp = -1 removes the complexity penalty so the tree is fully grown.
set.seed(20190724)
tree <- rpart(quality ~ ., data = newTrain %>% select(-quality_g),
              cp = -1, method = "class")

# -- Fully grown model --
plot(tree)
text(tree)

# -- cp: complexity parameter with the lowest cross-validated error --
index <- which.min(tree$cptable[, "xerror"])
cp <- tree$cptable[index, "CP"]

# -- Pruning --
tree <- prune(tree, cp = cp)

# -- Plot --
plot(tree, margin = 0.01, compress = TRUE, uniform = TRUE)
text(tree, cex = 0.8)
tree

# -- Variable importance --
barplot(sort(tree$variable.importance, decreasing = TRUE),
        main = "변수중요도", cex.main = 3)

# -- Prediction --
# Predictions are made on newTest, so they are scored against newTest's own
# grades (the original compared them with test$quality_g).
pred <- predict(tree, newTest, type = "class") %>% lv
confusionMatrix(pred, newTest$quality_g)

# ---- Naive Bayes classifier ----

# Pin select() to the dplyr version.
select <- dplyr::select

# Fit on the standardised training data without the grade column.
# fL = 1: the Laplace estimator is set to 1.
nb_train <- select(train2, -quality_g)
model <- NaiveBayes(quality ~ ., nb_train, fL = 1)

# Predict quality on test2, convert to grades and evaluate.
nb_class <- predict(model, test2, threshold = 0.1)$class
pred <- lv(nb_class)
confusionMatrix(pred, test2$quality_g)

# Refit the naive Bayes model on train2 with quality_g and the column(s) named
# in `x` removed, print the variables the model used, and return the confusion
# matrix of the graded predictions on test2.
k <- function(x) {
  nb_data <- select(train2, -c("quality_g", x))
  # fL = 1: the Laplace estimator is set to 1.
  fit <- NaiveBayes(quality ~ ., data = nb_data, fL = 1)
  print(fit$varnames)
  graded <- lv(predict(fit, test2, threshold = 0.1)$class)
  confusionMatrix(graded, test2$quality_g)
}

# Leave-one-variable-out runs: each call drops the named column(s) and reports
# the confusion matrix; the figures in the trailing comments are the author's
# recorded results.
names(train2)
k("type") # Accuracy = 0.9016, Kappa = 0.1204
k("fixed") # Accuracy = 0.9023, Kappa = 0.1264
k("volatile") # Accuracy = 0.9009, Kappa = -0.0024
k("citric") # Accuracy = 0.9037, Kappa = 0.1387
k("sugar") # Accuracy = 0.9037, Kappa = 0.1522
k("chlorides") # Accuracy = 0.9132, Kappa = 0.1847
k("ph") # Accuracy = 0.8996, Kappa = 0.1255
k("sd") # Accuracy = 0.9023, Kappa = 0.1309
k("sulphates") # Accuracy = 0.905, Kappa = 0.1368
k("alcohol") # Accuracy = 0.8982, Kappa = 0.1404

# Dropping several columns at once.
k(c("chlorides","sugar")) # Accuracy = 0.9111, Kappa = 0.1702
k(c("chlorides","sugar","alcohol")) # Accuracy = 0.9111, Kappa = 0.1794
k(c("chlorides","sugar","alcohol","citric")) # Accuracy = 0.9125, Kappa = 0.1737
k(c("chlorides","sugar","alcohol","citric","sulphates")) # Accuracy = 0.9118, Kappa = 0.0.143 (sic; presumably 0.143 -- verify)
# volatile - largest effect (author's conclusion)
# chlorides - smallest effect (author's conclusion)

# '데이터 마이닝' 카테고리의 다른 글
#
# 모형의 평가방법( 예측모형) (0)	2019.08.29
# Kaggle 대회 데이터 분석 (0)	2019.08.28
# 머신러닝 시작 (0)	2019.08.09
# 통계상담과제 (0)	2019.07.22
# Pulse 분석 (0)	2019.07.22
#
# betago의 머신러닝 수업기록
#
# WineQuality 분석
#
# '데이터 마이닝' 카테고리의 다른 글
#
# 티스토리툴바
#
# WineQuality 분석
#
# '데이터 마이닝' 카테고리의 다른 글
#
# '데이터 마이닝' Related Articles
#
# 티스토리툴바