Individual leaning is measured by analyzing the content produced by an users: the link it published the likes it gave the tweets it retweeted …
gab is a far right social media and almost everything was (is) about politics, available here or here
original URL | expanded URL | domain |
---|---|---|
http://bit.ly/2zDszOz | https://www.churchmilitant.com/news/article/israel-uncooperative-after-historic-church-in-the-holy-land-burned-vandaliz | www.churchmilitant.com |
original URL | expanded URL | domain | bias | numerical value |
---|---|---|---|---|
http://bit.ly/2zDszOz | https://www.churchmilitant.com/news/article/israel-uncooperative-after-historic-church-in-the-holy-land-burned-vandaliz | www.churchmilitant.com | extreme right | +1 |
library(data.table)
setDTthreads(0)
user_data=readRDS("gab_users.rds")
head(user_data)
*for each users, we measure its leaning by taking the average leaning of all its posts
user_pol=user_data[!is.na(bias), .(pol=mean(num_bias), .N), by=.(user.id)]
user_pol[,pol:=round(user_pol$pol, digits = 2)]
library(ggplot2)
library(viridis)
## Loading required package: viridisLite
library(ggridges)
ggplot(user_pol,aes(x=pol, y=rep(1,nrow(user_pol)), fill= after_stat(x)))+
geom_density_ridges_gradient()+
scale_fill_viridis(breaks=c(-1,1), labels=c("Extreme Left","Extreme Right"))+
xlim(-1,1)+theme_bw()+
theme(
legend.key.width = unit(1, "cm"),
legend.title = element_blank(),
legend.background = element_rect(color = "transparent"),
legend.position = c(0.3,0.88),
legend.direction = "horizontal",
legend.spacing.y = unit(0.4, 'cm'),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
text = element_text(size=20)) +
labs(x = expression(paste("Polarization")), y = "PDF")
## Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
## 3.5.0.
## ℹ Please use the `legend.position.inside` argument of `theme()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Picking joint bandwidth of 0.0621
homophily is measured by analyzing how users interacts among themselves:
follower/following retweet comment co-commenting *…
Load (some) users about interaction on gab
data=fread("/Users/gale/Desktop/phd/Lessons/echo_chambers/Gab_1711-1810_url.csv.gz", nrows=10, skip=0)
clnames=colnames(data)
data=fread("/Users/gale/Desktop/phd/Lessons/echo_chambers/Gab_1711-1810_url.csv.gz", nrows=5000, skip=3000000, col.names = clnames)
data=data[(is.na(parent_id) | !is.na(conversation_parent_id))]
head(data)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(Matrix)
users_cascade=data[is_reply==TRUE & !is.na(parent_id),.(user.id,user.username, .N), by=.(parent_id)]
#keep only the cascade > 1
users_cascade<-users_cascade[N>1]
u_name<-as.data.table(data.frame(user.id=users_cascade$user.id,user_username=users_cascade$user.username,alias_user.id=as.numeric(as.factor(as.character(users_cascade$user.id)))))
u_name<-distinct(u_name,alias_user.id, .keep_all = TRUE)
u_name<-u_name[order(alias_user.id)]
edgelist<-data.frame(user.id=as.numeric(as.factor(as.character(users_cascade$user.id))),conv_id=as.numeric(as.factor(as.character(users_cascade$parent_id))))
adj_mat<-sparseMatrix(i=edgelist$user.id, j=edgelist$conv_id, x=rep(1,nrow(edgelist)))
head(summary(adj_mat))
user_proj<-adj_mat %*% t(adj_mat)
diag(user_proj)<-0
head(summary(user_proj))
library(ggraph)
gab_conversation_graph<-graph_from_adjacency_matrix(user_proj, mode="undirected")
V(gab_conversation_graph)$id<-u_name$user.id
V(gab_conversation_graph)$username<-as.character(u_name$user_username)
V(gab_conversation_graph)$cluster=cluster_louvain(gab_conversation_graph)$membership
V(gab_conversation_graph)$degree=degree(gab_conversation_graph)
vcount(gab_conversation_graph)
## [1] 295
ecount(gab_conversation_graph)
## [1] 1318
ggraph(gab_conversation_graph, layout = "fr") +
geom_edge_link() +
geom_node_point(aes(color=as.factor(cluster), size=degree))+
theme_void()+
theme(legend.position = "none")
users_we_have=user_pol[as.character(user.id) %in% as.character(V(gab_conversation_graph)$id)]
users_we_have=users_we_have[order(match(user.id,V(gab_conversation_graph)$id))]
gab_conversation_graph=set_vertex_attr(gab_conversation_graph,"n_leaning",index = V(gab_conversation_graph)[as.character(id) %in% as.character(users_we_have$user.id)],users_we_have$pol)
ggraph(gab_conversation_graph, layout = "fr") +
geom_edge_link() +
geom_node_point(aes(color=n_leaning, size=degree))+
theme_void()
#theme(legend.position = "none")
bar_data=data.table(leaning=V(gab_conversation_graph)$n_leaning,
cluster=as.factor(V(gab_conversation_graph)$cluster))
bar_data=bar_data[,.(avg_leaning=mean(leaning, na.rm=T),.N),by=.(cluster)]
bar_data=bar_data[order(avg_leaning)]
ggplot(bar_data[!is.nan(avg_leaning)], aes(x=1:nrow(bar_data[!is.nan(avg_leaning)]),y=N,fill=avg_leaning))+
geom_bar(stat="identity")+
theme_classic()+
scale_fill_viridis(limits=c(-1,1))+
labs(x="cluster ID", y="number of users")
compute the average leaning of the neighbours of each node and plot them in a 2D graph
nn_data=as_long_data_frame(gab_conversation_graph)
setDT(nn_data)
neighbors_data=rbind(nn_data[from_id!=to_id,
.(nn_leaning=mean(to_n_leaning,na.rm=T),
leaning=unique(from_n_leaning)),
by=.(id=from_id)],
nn_data[from_id!=to_id,
.(nn_leaning=mean(from_n_leaning, na.rm=T),
leaning=unique(to_n_leaning)),
by=.(id=to_id)])
neighbors_data=distinct(neighbors_data,id,.keep_all = T)
ggplot(neighbors_data[!is.na(leaning) & !is.nan(nn_leaning)], aes(x=leaning, y=nn_leaning))+
geom_density_2d_filled()+
geom_point(alpha=0.5)+
scale_fill_viridis(option = "magma", discrete=T)+
theme_classic()+
theme(legend.position = "none")
data?
function to compute correspondence analysis on a matrix of interction
CA <- function (obj, nd = NA, suprow = NA, supcol = NA, subsetrow = NA,
subsetcol = NA, verbose=TRUE)
{
if (verbose) message("Preparing matrix object...")
nd0 <- nd
I <- dim(obj)[1]
J <- dim(obj)[2]
rn <- dimnames(obj)[[1]]
cn <- dimnames(obj)[[2]]
N <- matrix(as.matrix(obj), nrow = I, ncol = J)
Ntemp <- N
NtempC <- NtempR <- N
rm("N")
suprow <- sort(suprow)
supcol <- sort(supcol)
if (!is.na(supcol[1]) & !is.na(suprow[1])) {
NtempC <- Ntemp[-suprow, ]
NtempR <- Ntemp[, -supcol]
}
if (!is.na(supcol[1])) {
SC <- as.matrix(NtempC[, supcol])
Ntemp <- Ntemp[, -supcol]
cs.sum <- apply(SC, 2, sum)
}
rm("NtempC")
if (!is.na(suprow[1])) {
SR <- matrix(as.matrix(NtempR[suprow, ]), nrow = length(suprow))
Ntemp <- Ntemp[-suprow, ]
rs.sum <- apply(SR, 1, sum)
}
rm("NtempR")
N <- matrix(as.matrix(Ntemp), nrow = dim(Ntemp)[1], ncol = dim(Ntemp)[2])
subsetrowt <- subsetrow
if (!is.na(subsetrow[1]) & !is.na(suprow[1])) {
subsetrowi <- subsetrow
subsetrowt <- sort(c(subsetrow, suprow))
subsetrowt <- subsetrowt[!duplicated(subsetrowt)]
I <- length(subsetrowt)
for (q in length(suprow):1) {
subsetrow <- subsetrow[subsetrow != suprow[q]]
subsetrow <- subsetrow - as.numeric(suprow[q] < subsetrow)
}
for (q in 1:length(suprow)) suprow[q] <- (1:length(subsetrowt))[subsetrowt ==
suprow[q]]
}
subsetcolt <- subsetcol
if (!is.na(subsetcol[1]) & !is.na(supcol[1])) {
subsetcoli <- subsetcol
subsetcolt <- sort(c(subsetcol, supcol))
subsetcolt <- subsetcolt[!duplicated(subsetcolt)]
J <- length(subsetcolt)
for (q in length(supcol):1) {
subsetcol <- subsetcol[subsetcol != supcol[q]]
subsetcol <- subsetcol - as.numeric(supcol[q] < subsetcol)
}
for (q in 1:length(supcol)) supcol[q] <- (1:length(subsetcolt))[subsetcolt ==
supcol[q]]
}
dim.N <- dim(N)
if (!is.na(subsetrow[1])) {
if (!is.na(supcol[1]))
SC <- as.matrix(SC[subsetrow, ])
}
if (!is.na(subsetcol[1])) {
if (!is.na(suprow[1]))
SR <- matrix(as.matrix(SR[, subsetcol]), nrow = length(suprow))
}
if (is.na(subsetrow[1]) & is.na(subsetcol[1])) {
nd.max <- min(dim.N) - 1
}
else {
N00 <- N
if (!is.na(subsetrow[1]))
N00 <- N00[subsetrow, ]
if (!is.na(subsetcol[1]))
N00 <- N00[, subsetcol]
dim.N <- dim(N00)
nd.max <- min(dim.N)
if (!is.na(subsetrow[1]) & is.na(subsetcol[1])) {
if (dim.N[1] > dim.N[2])
nd.max <- min(dim.N) - 1
}
else {
if (is.na(subsetrow[1]) & !is.na(subsetcol[1])) {
if (dim.N[2] > dim.N[1]) {
nd.max <- min(dim.N) - 1
}
}
}
}
if (verbose) message("Standardizing matrix...")
if (is.na(nd) | nd > nd.max)
nd <- nd.max
n <- sum(N)
P <- N/n
rm <- apply(P, 1, sum)
cm <- apply(P, 2, sum)
eP <- rm %*% t(cm)
S <- (P - eP)/sqrt(eP)
rm("eP")
rm("P")
if (!is.na(subsetcol[1])) {
S <- S[, subsetcol]
cm <- cm[subsetcol]
cn <- cn[subsetcolt]
}
if (!is.na(subsetrow[1])) {
S <- S[subsetrow, ]
rm <- rm[subsetrow]
rn <- rn[subsetrowt]
}
#chimat <- S^2 * n
if (verbose) message("Computing SVD...")
dec <- svd(S)
sv <- dec$d[1:nd.max]
u <- dec$u
v <- dec$v
ev <- sv^2
cumev <- cumsum(ev)
totin <- sum(ev)
rin <- apply(S^2, 1, sum)
cin <- apply(S^2, 2, sum)
rm("S")
rm("dec")
rachidist <- sqrt(rin/rm)
cachidist <- sqrt(cin/cm)
rchidist <- rep(NA, I)
cchidist <- rep(NA, J)
if (!is.na(subsetrow[1])) {
obj <- obj[subsetrowt, ]
}
if (!is.na(subsetcol[1])) {
obj <- obj[, subsetcolt]
}
###
if (!is.na(suprow[1])) {
if (is.na(supcol[1])) {
P.stemp <- matrix(as.matrix(obj[suprow, ]), nrow = length(suprow))
}
else {
P.stemp <- matrix(as.matrix(obj[suprow, -supcol]),
nrow = length(suprow))
}
P.stemp <- P.stemp/apply(P.stemp, 1, sum)
P.stemp <- t((t(P.stemp) - cm)/sqrt(cm))
rschidist <- sqrt(apply(P.stemp^2, 1, sum))
rchidist[-suprow] <- rachidist
rchidist[suprow] <- rschidist
rm("P.stemp")
}
else rchidist <- rachidist
if (!is.na(supcol[1])) {
if (is.na(suprow[1])) {
P.stemp <- as.matrix(obj[, supcol])
}
else P.stemp <- as.matrix(obj[-suprow, supcol])
P.stemp <- t(t(P.stemp)/apply(P.stemp, 2, sum))
P.stemp <- (P.stemp - rm)/sqrt(rm)
cschidist <- sqrt(apply(P.stemp^2, 2, sum))
cchidist[-supcol] <- cachidist
cchidist[supcol] <- cschidist
rm("P.stemp")
}
else cchidist <- cachidist
phi <- as.matrix(u[, 1:nd])/sqrt(rm)
gam <- as.matrix(v[, 1:nd])/sqrt(cm)
if (verbose) message("Projecting rows...")
if (!is.na(suprow[1])) {
cs <- cm
gam.00 <- gam
base2 <- SR/matrix(rs.sum, nrow = nrow(SR), ncol = ncol(SR))
base2 <- t(base2)
cs.0 <- matrix(cs, nrow = nrow(base2), ncol = ncol(base2))
svphi <- matrix(sv[1:nd], nrow = length(suprow), ncol = nd,
byrow = TRUE)
base2 <- base2 - cs.0
phi2 <- (t(as.matrix(base2)) %*% gam.00)/svphi
phi3 <- matrix(NA, ncol = nd, nrow = I)
phi3[suprow, ] <- phi2
phi3[-suprow, ] <- phi
rm0 <- rep(NA, I)
rm0[-suprow] <- rm
P.star <- SR/n
rm0[suprow] <- NA
rin0 <- rep(NA, I)
rin0[-suprow] <- rin
rin <- rin0
}
if (verbose) message("Projecting columns...")
if (!is.na(supcol[1])) {
rs <- rm
phi.00 <- phi
base2 <- SC/matrix(cs.sum, nrow = nrow(SC), ncol = ncol(SC),
byrow = TRUE)
rs.0 <- matrix(rs, nrow = nrow(base2), ncol = ncol(base2))
svgam <- matrix(sv[1:nd], nrow = length(supcol), ncol = nd,
byrow = TRUE)
base2 <- base2 - rs.0
gam2 <- (as.matrix(t(base2)) %*% phi.00)/svgam
gam3 <- matrix(NA, ncol = nd, nrow = J)
gam3[supcol, ] <- gam2
gam3[-supcol, ] <- gam
cm0 <- rep(NA, J)
cm0[-supcol] <- cm
P.star <- SC/n
cm0[supcol] <- NA
cin0 <- rep(NA, J)
cin0[-supcol] <- cin
cin <- cin0
}
if (exists("phi3"))
phi <- phi3
if (exists("gam3"))
gam <- gam3
if (exists("rm0"))
rm <- rm0
if (exists("cm0"))
cm <- cm0
ca.output <- list(sv = sv, nd = nd0, rownames = rn, rowmass = rm,
rowdist = rchidist, rowinertia = rin, rowcoord = phi,
rowsup = suprow, colnames = cn, colmass = cm, coldist = cchidist,
colinertia = cin, colcoord = gam, colsup = supcol, call = match.call())
class(ca.output) <- "ca"
if (verbose) message("Done!")
return(ca.output)
}
rt_edges=fread("/Users/gale/Desktop/phd/Lessons/echo_chambers/Retweet_network_cop26.csv.gz", colClasses = "character")
rt_edges[,Edge_weight:=as.numeric(Edge_weight)]
influencers=fread("/Users/gale/Desktop/phd/Lessons/echo_chambers/cop26_top_30_pg.csv", colClasses = "character")
head(rt_edges)
head(influencers)
create edgelist convert to a graph and
edge_list=rt_edges[,.(Retweeter_id,Tweeter_id,weight=as.numeric(Edge_weight))]
g=graph_from_data_frame(edge_list,directed = T)
g=igraph::simplify(g,edge.attr.comb = "sum")
interaction_matrix=get.adjacency(g,type="upper",attr = "weight", sparse = igraph_opt("sparsematrices"))
## Warning: `get.adjacency()` was deprecated in igraph 2.0.0.
## ℹ Please use `as_adjacency_matrix()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
interaction_matrix=interaction_matrix[!rownames(interaction_matrix) %in% influencers$id,]
interaction_matrix=interaction_matrix[,colnames(interaction_matrix) %in% influencers$id]
head(interaction_matrix)
## 6 x 30 sparse Matrix of class "dgCMatrix"
## [[ suppressing 30 column names '7587032', '14224719', '14353202' ... ]]
##
## 509 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## 614 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## 2724 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## 2737 . . . . . . . . . 2 . . . . . . . . . . . . . . . . . . . .
## 4276 . . . . 2 . . . . . . . . . . . . . . . . . . . . . . . . .
## 4663 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
clean the matrix from noise
interaction_matrix=as.matrix(interaction_matrix)
cs= colSums(interaction_matrix>0)
interaction_matrix=interaction_matrix[,cs>0]
rs = rowSums(interaction_matrix>0)
interaction_matrix=interaction_matrix[rs>1,]
apply CA algorithm
res = CA(interaction_matrix, nd=3)
## Preparing matrix object...
## Standardizing matrix...
## Computing SVD...
## Projecting rows...
## Projecting columns...
## Done!
names(res)
## [1] "sv" "nd" "rownames" "rowmass" "rowdist"
## [6] "rowinertia" "rowcoord" "rowsup" "colnames" "colmass"
## [11] "coldist" "colinertia" "colcoord" "colsup" "call"
#head(res)
####visualization of the results:
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:viridis':
##
## viridis_pal
library(dplyr)
influencers_est=data.table(id=res$colnames,est=rescale(res$colcoord[,1],to=c(-1,1)))
user_est=data.table(id=res$rownames,est=rescale(res$rowcoord[,1],to=c(-1,1)))
rt_table=distinct(edge_list,Retweeter_id,Tweeter_id)
rt_table=rt_table[Tweeter_id %in% influencers_est$id]
user_est=merge(user_est,rt_table,by.x="id",by.y="Retweeter_id", all.x=T)
influencers_est=merge(influencers_est,user_est[,.(med_est=median(est),
avg_est=mean(est),
min_val=min(est),
max_val=max(est),
q_1 = quantile(est,probs=0.25),
q_3 = quantile(est, probs = 0.75)),by=.(Tweeter_id)],
by.x="id", by.y="Tweeter_id", all.x=T)
influencers_est[,iqr:=q_3-q_1]
influencers_est=merge(influencers_est,influencers[,.(id,name,group)],by.x="id",by.y="id")
influencers_est=influencers_est[order(med_est,decreasing = T)]
influencers_est[,xpos:=ifelse(med_est<0,
ifelse(min_val>q_1-1.5*iqr,min_val,q_1-1.5*iqr),
ifelse(max_val<q_3+1.5*iqr,max_val,q_3+1.5*iqr))]
user_est=merge(user_est,influencers_est[,.(id,group,med_est)], by.x="Tweeter_id", by.y="id")
user_est[,Tweeter_id:=factor(Tweeter_id,levels = influencers_est$id)]
head(user_est)
create two plots and merge after:
ide=ggplot()+
geom_boxplot(data=user_est, aes(x=est, y=Tweeter_id, group=Tweeter_id, color=med_est),
show.legend = T,
outlier.size = 0.2,
outlier.alpha = 0.05)+
geom_text(data=influencers_est[med_est<0], aes(x=xpos-0.01,y=id,label=name),color="black", show.legend = F, hjust =1)+
geom_text(data=influencers_est[med_est>0], aes(x=xpos+0.01,y=id,label=name), color="black", show.legend = FALSE, hjust = 0)+
theme_classic()+
scale_x_continuous(limits = c(-1.5, 1.5))+
theme(legend.position = c(0.5,0.2),
legend.direction="horizontal",
legend.background = element_rect(fill = NA, colour = NA),
legend.title = element_blank(),
axis.line.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.title.y = element_blank(),
axis.title.x = element_blank(),
axis.line.x = element_blank(),
axis.ticks.x =element_blank(),
axis.text.x = element_blank(),
text=element_text(size=14),
plot.margin=unit(c(1,1,0,1), "cm")
)+
scale_color_gradient(low = muted("red"), high=muted("green"),
breaks=c(-1,1),labels=c("Denial","Pro"), limits=c(-1,1))+
guides(color=guide_colorbar(direction = "horizontal", ticks=F, barwidth = 10))
ide
hist_data=rbind(user_est[,.(est=unique(est), type="user"), by=.(id)],
influencers_est[,.(est=med_est, type="influencer"), by=.(id)])
density_plot=ggplot(hist_data, aes(x=est,fill=type))+
#geom_area(alpha=0.6)+
geom_histogram(binwidth = 0.05,alpha=0.5, aes(y = ..density..), position = "identity")+
xlim(-1.5,1.5)+
#scale_y_continuous(breaks = seq(0, 9, by = 3), limits = c(0,12))+
theme_classic()+
#scale_fill_manual(values = binary_colors)+
theme(legend.title = element_blank(),#)+#,
legend.position = c(0.5,0.7),
text=element_text(size=14),
plot.margin=unit(c(0.1,1,1,1), "cm")
)+
labs(x="Latent ideology", y="density")
density_plot
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).
all together now!
library(ggpubr)
ggarrange(ide,density_plot,nrow=2, heights = c(0.6, 0.4), align = "v")
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).
Cinelli, M., De Francisci Morales, G., Galeazzi, A., Quattrociocchi, W., & Starnini, M. (2021). The echo chamber effect on social media. Proceedings of the National Academy of Sciences, 118(9), e2023301118.
Flamino, J., Galeazzi, A., Feldman, S., Macy, M. W., Cross, B., Zhou, Z., … & Szymanski, B. K. (2023). Political polarization of news media and influencers on Twitter in the 2016 and 2020 US presidential elections. Nature Human Behaviour, 7(6), 904-916.
Falkenberg, M., Galeazzi, A., Torricelli, M., Di Marco, N., Larosa, F., Sas, M., … & Baronchelli, A. (2022). Growing polarization around climate change on social media. Nature Climate Change, 12(12), 1114-1121.
Galeazzi, A., Peruzzi, A., Brugnoli, E., Delmastro, M., & Zollo, F. (2024). Unveiling the hidden agenda: Biases in news reporting and consumption. PNAS nexus, 3(11), pgae474.
Baqir, A., Galeazzi, A., Drocco, A., & Zollo, F. (2023). Social media polarization reflects shifting political alliances in pakistan. arXiv preprint arXiv:2309.08075.
Baqir, A., Chen, Y., Diaz-Diaz, F., Kiyak, S., Louf, T., Morini, V., … & Galeazzi, A. (2023). Beyond active engagement: the significance of lurkers in a polarized Twitter debate. arXiv preprint arXiv:2306.17538.
Find more studies on my google scholar!