###############################################################################
# Lab 7 - May 2025: Cluster Analysis
###############################################################################

## -- Hierarchical methods ----------------------------------------------------

# Load the risk dataset and take a first look at its size and variable means.
risk <- read.csv("risk.csv")
dim(risk)
colMeans(risk)

# Manhattan distance matrix + complete-linkage hierarchical clustering.
risk.dist <- dist(risk, method = "manhattan")
risk.hcl <- hclust(risk.dist, method = "complete")
risk.hcl
plot(risk.hcl, main = "", labels = FALSE)

# Cut the dendrogram at height 20 ...
c2 <- cutree(risk.hcl, h = 20)
table(c2)

# ... and into exactly 6 clusters.
c6 <- cutree(risk.hcl, k = 6)
table(c6)

# Cluster profiles: variable means within each of the 6 clusters.
# Drop the first column (the cluster label) and round to 1 decimal place.
c6.means <- aggregate(risk, list(Cluster = c6), mean)
round(c6.means[, -1], 1)

# install.packages("flexclust")
library("flexclust")
barchart(risk.hcl, risk, k = 6)

## -- K-means -----------------------------------------------------------------

# k-means for k = 2..8 with 10 random restarts each (stepcclust keeps the best
# restart per k); the seed makes the stochastic initialisation reproducible.
set.seed(1234)
risk.km28 <- stepcclust(risk, k = 2:8, nrep = 10)
plot(risk.km28)

# Segment profiles for the 2- and 6-cluster solutions.
barchart(risk.km28[["2"]])
barchart(risk.km28[["6"]])

###############################################################################
## Case study: fast food data
# Imagine we are a fast-food chain (e.g. McDonald's) and we want to know
# whether there are segments with different perceptions of the brand image.
# Brand perception: which segments should we focus our attention on?
# Positive or negative perception?
###############################################################################
## STEP 1: data exploration
###############################################################################

# Read the survey; stringsAsFactors = TRUE so character columns become factors.
mcdonalds <- read.csv("mcdonalds.csv", stringsAsFactors = TRUE)
mcdonalds <- mcdonalds[, -1]  # drop the first (row-id) column
str(mcdonalds)
names(mcdonalds)
dim(mcdonalds)
head(mcdonalds, 3)

### Matrix of segmentation variables: first 11 columns, recoded to 0/1
### ("Yes" -> 1, anything else -> 0).
MD.x <- as.matrix(mcdonalds[, 1:11])
MD.x <- (MD.x == "Yes") + 0
head(MD.x)
round(colMeans(MD.x), 2)

### Some preliminary analyses of the variables

# Inspect the levels of VisitFrequency ...
levels(mcdonalds$VisitFrequency)
# ... and reorder them from least to most frequent.
mcdonalds$VisitFrequency <- factor(
  mcdonalds$VisitFrequency,
  levels = c("Never", "Once a year", "Every three months", "Once a month",
             "Once a week", "More than once a week")
)
# Check the new ordering.
levels(mcdonalds$VisitFrequency)

### The Like variable: reorder its levels into increasing order of liking.
levels(mcdonalds$Like)
mcdonalds$Like <- factor(mcdonalds$Like,
                         levels = levels(mcdonalds$Like)[c(10, 4:1, 9, 5:8, 11)])
levels(mcdonalds$Like)

# Numeric version of Like, centred on 0.
mcdonalds$Like.n <- as.numeric(mcdonalds$Like) - 6

###############################################################################
## STEP 2: segment extraction
###############################################################################

### Hierarchical clustering on the binary matrix (Euclidean distance,
### complete linkage).
Mdist <- dist(MD.x, method = "euclidean")
MD.c4 <- hclust(Mdist, method = "complete")
plot(MD.c4, main = "", labels = FALSE)
# MD.c4 <- hclust(Mdist, method = "single")

### Using Gower's index instead.
library(cluster)
# daisy() computes dissimilarities between observations in a data frame;
# the variables may be numeric, ordinal or binary.
# Gower dissimilarity on the raw (factor) segmentation variables.
Mdist <- daisy(mcdonalds[, 1:11], metric = "gower")
MD.c4 <- hclust(Mdist, method = "complete")
plot(MD.c4, main = "", labels = FALSE)

###############################################################################
## K-means clustering
###############################################################################

library("flexclust")
set.seed(1234)
# k-means for 2..8 segments, 10 restarts each; then a scree-type plot of the
# within-cluster criterion against the number of segments.
MD.km28 <- stepFlexclust(MD.x, 2:8, nrep = 10, verbose = TRUE)
plot(MD.km28, xlab = "number of segments")

# Keep the 4-segment solution and profile it.
MD.k4 <- MD.km28[["4"]]
barchart(MD.k4)

###############################################################################
## STEP 3: segment description
###############################################################################

k4 <- clusters(MD.k4)

# Cross-tabulate segment membership with gender.
k4.Gender <- with(mcdonalds, table("Segment number" = k4, Gender))
k4.Gender
mosaicplot(k4.Gender, main = "")

##### Logistic regression
# install.packages("car")
# install.packages("effects")
library("car")
library("effects")

# Probability of belonging to segment 4 as a function of age,
# visit frequency and gender.
f1 <- I(k4 == 4) ~ Age + VisitFrequency + Gender
model.1 <- glm(f1, data = mcdonalds, family = binomial())
model.1
summary(model.1)
Anova(model.1)
plot(allEffects(mod = model.1))

# Same model for segment 1.
f2 <- I(k4 == 1) ~ Age + VisitFrequency + Gender
model.2 <- glm(f2, data = mcdonalds, family = binomial())
model.2
summary(model.2)
Anova(model.2)
plot(allEffects(mod = model.2))

###### Multinomial regression: model all four segments jointly.
library("nnet")
set.seed(12)
model.k4 <- multinom(k4 ~ Age + VisitFrequency + Gender,
                     data = mcdonalds, trace = 0)
model.k4
summary(model.k4)
Anova(model.k4)
plot(allEffects(mod = model.k4), layout = c(3, 2), rug = FALSE)

###############################################################################
## STEP 5: target segment selection
###############################################################################

# Treat VisitFrequency as numeric (this assumes equal spacing between the
# ordered levels -- wrong in general, but an acceptable approximation)
# and compute per-segment means.
visit <- tapply(as.numeric(mcdonalds$VisitFrequency), k4, mean)
visit
like <- tapply(mcdonalds$Like.n, k4, mean)
like
female <- tapply((mcdonalds$Gender == "Female") + 0, k4, mean)
female

# Segment evaluation plot: bubble size proportional to the share of women.
plot(visit, like, cex = 10 * female, xlim = c(2, 4.5), ylim = c(-3, 3.5))

###############################################################################
# Model-based clustering
###############################################################################

###############################################################################
## Finite mixture models
###############################################################################

## Mixtures of normal distributions
library(flexclust)  # for the data
library(mclust)
data("vacmot")  # dataset from "flexclust"

# Keep three numeric variables and drop rows with missing values.
vacmet <- vacmotdesc[, c("Obligation", "NEP", "Vacation.Behaviour")]
vacmet <- na.omit(vacmet)

# Visualise the data.
pairs(vacmet, pch = 19, col = rgb(0, 0, 0, 0.2))

# Fit all 14 default covariance models (Mclust's default) with G = 1..8
# components; the model with the best BIC is selected automatically.
# Best model here: VEE (ellipsoidal, equal shape and orientation).
vacmet.m18 <- Mclust(vacmet, G = 1:8)

# Alternatively, fit only models whose covariance matrices are equal across
# segments:
#   EEI: diagonal, equal volume and shape
#   EII: spherical, equal volume
#   EEE: ellipsoidal, equal volume, shape and orientation
vacmet.m18.equal <- Mclust(vacmet, G = 1:8,
                           modelNames = c("EEI", "EII", "EEE"))

vacmet.m18
vacmet.m18.equal
plot(vacmet.m18, what = "classification")
plot(vacmet.m18.equal, what = "classification")
plot(vacmet.m18, what = "uncertainty")
plot(vacmet.m18, what = "BIC")
plot(vacmet.m18.equal, what = "uncertainty")
plot(vacmet.m18.equal, what = "BIC")

# In both selected mixture models the covariance matrices share the same
# orientation and shape, i.e. the correlation structure between the variables
# is the same in every segment.
###############################################################################
## Mixtures of regressions
###############################################################################

# install.packages("flexmix")
library("flexmix")

# Standardise Obligation and NEP (makes the results easier to visualise).
vacmet[, c("Obligation", "NEP")] <- scale(vacmet[, c("Obligation", "NEP")])

# First of all, fit a single linear regression model.
vacmet.lm <- lm(Vacation.Behaviour ~ Obligation + NEP, data = vacmet)
summary(vacmet.lm)

# Then try a mixture of regression models, to see whether the relationship
# differs across segments: k = 1..5 components, 10 EM restarts each.
set.seed(1234)
vacmet.m15 <- stepFlexmix(Vacation.Behaviour ~ ., data = vacmet, k = 1:5,
                          nrep = 10, verbose = FALSE,
                          control = list(iter.max = 1000))

# Select the best model according to the BIC.
vacmet.m2 <- getModel(vacmet.m15)
vacmet.m2

# refit() gives us the standard errors of the estimates: it starts from the
# EM solution and runs a general-purpose optimiser to obtain the information
# needed for the standard errors.
summary(refit(vacmet.m2))

# Scatter plots coloured by segment, with each component's fitted regression
# line (intercept and slope taken from the parameter matrix; columns index
# the components).
par(mfrow = c(1, 2))
plot(Vacation.Behaviour ~ Obligation, data = vacmet, pch = 20,
     col = clusters(vacmet.m2))
abline(parameters(vacmet.m2)[1:2, 1], col = 1, lwd = 2)
abline(parameters(vacmet.m2)[1:2, 2], col = 2, lwd = 2)
plot(Vacation.Behaviour ~ NEP, data = vacmet, pch = 20,
     col = clusters(vacmet.m2))
abline(parameters(vacmet.m2)[c(1, 3), 1], col = 1, lwd = 2)
abline(parameters(vacmet.m2)[c(1, 3), 2], col = 2, lwd = 2)