This is to give you some tools for calculating the Fightin Words statistic, extracting top-ranked terms, and plotting. Bare bones at the moment.
library(tidyr)
library(dplyr)
library(ggplot2)
library(ggrepel)
fwgroups <- function(dtm, groups, pair = NULL, weights = rep(1,nrow(dtm)), k.prior = .1) {
weights[is.na(weights)] <- 0
weights <- weights/mean(weights)
zero.doc <- rowSums(dtm)==0 | weights==0
zero.term <- colSums(dtm[!zero.doc,])==0
dtm.nz <- apply(dtm[!zero.doc,!zero.term],2,"*", weights[!zero.doc])
g.prior <- tcrossprod(rowSums(dtm.nz),colSums(dtm.nz))/sum(dtm.nz)
#
g.posterior <- as.matrix(dtm.nz + k.prior*g.prior)
groups <- groups[!zero.doc]
groups <- droplevels(groups)
g.adtm <- as.matrix(aggregate(x=g.posterior,by=list(groups=groups),FUN=sum)[,-1])
rownames(g.adtm) <- levels(groups)
g.ladtm <- log(g.adtm)
g.delta <- t(scale( t(scale(g.ladtm, center=T, scale=F)), center=T, scale=F))
g.adtm_w <- -sweep(g.adtm,1,rowSums(g.adtm)) # terms not w spoken by k
g.adtm_k <- -sweep(g.adtm,2,colSums(g.adtm)) # w spoken by groups other than k
g.adtm_kw <- sum(g.adtm) - g.adtm_w - g.adtm_k - g.adtm # total terms not w or k
g.se <- sqrt(1/g.adtm + 1/g.adtm_w + 1/g.adtm_k + 1/g.adtm_kw)
g.zeta <- g.delta/g.se
g.counts <- as.matrix(aggregate(x=dtm.nz, by = list(groups=groups), FUN=sum)[,-1])
if (!is.null(pair)) {
pr.delta <- t(scale( t(scale(g.ladtm[pair,], center = T, scale =F)), center=T, scale=F))
pr.adtm_w <- -sweep(g.adtm[pair,],1,rowSums(g.adtm[pair,]))
pr.adtm_k <- -sweep(g.adtm[pair,],2,colSums(g.adtm[pair,])) # w spoken by groups other than k
pr.adtm_kw <- sum(g.adtm[pair,]) - pr.adtm_w - pr.adtm_k - g.adtm[pair,] # total terms not w or k
pr.se <- sqrt(1/g.adtm[pair,] + 1/pr.adtm_w + 1/pr.adtm_k + 1/pr.adtm_kw)
pr.zeta <- pr.delta/pr.se
return(list(zeta=pr.zeta[1,], delta=pr.delta[1,],se=pr.se[1,], counts = colSums(dtm.nz), acounts = colSums(g.adtm)))
} else {
return(list(zeta=g.zeta,delta=g.delta,se=g.se,counts=g.counts,acounts=g.adtm))
}
}
############## FIGHTIN' WORDS PLOTTING FUNCTION
# helper function
makeTransparent<-function(someColor, alpha=100)
{
newColor<-col2rgb(someColor)
apply(newColor, 2, function(curcoldata){rgb(red=curcoldata[1], green=curcoldata[2],
blue=curcoldata[3],alpha=alpha, maxColorValue=255)})
}
fw.ggplot.groups <- function(fw.ch, groups.use = as.factor(rownames(fw.ch$zeta)), max.words = 50, max.countrank = 400, colorpalette=rep("black",length(groups.use)), sizescale=2, title="Comparison of Terms by Groups", subtitle = "", caption = "Group-specific terms are ordered by Fightin' Words statistic (Monroe, et al. 2008)") {
if (is.null(dim(fw.ch$zeta))) {## two-group fw object consists of vectors, not matrices
zetarankmat <- cbind(rank(-fw.ch$zeta),rank(fw.ch$zeta))
colnames(zetarankmat) <- groups.use
countrank <- rank(-(fw.ch$counts))
} else {
zetarankmat <- apply(-fw.ch$zeta[groups.use,],1,rank)
countrank <- rank(-colSums(fw.ch$counts))
}
wideplotmat <- as_tibble(cbind(zetarankmat,countrank=countrank))
wideplotmat$term=names(countrank)
#rankplot <- gather(wideplotmat, party, zetarank, 1:ncol(zetarankmat))
rankplot <- gather(wideplotmat, groups.use, zetarank, 1:ncol(zetarankmat))
rankplot$plotsize <- sizescale*(50/(rankplot$zetarank))^(1/4)
rankplot <- rankplot[rankplot$zetarank < max.words + 1 & rankplot$countrank<max.countrank+1,]
rankplot$groups.use <- factor(rankplot$groups.use,levels=groups.use)
p <- ggplot(rankplot, aes((nrow(rankplot)-countrank)^1, -(zetarank^1), colour=groups.use)) +
geom_point(show.legend=F,size=sizescale/2) +
theme_classic() +
theme(axis.ticks=element_blank(), axis.text=element_blank() ) +
ylim(-max.words,40) +
facet_grid(groups.use ~ .) +
geom_text_repel(aes(label = term), size = rankplot$plotsize, point.padding=.05,
box.padding = unit(0.20, "lines"), show.legend=F) +
scale_colour_manual(values = alpha(colorpalette, .7)) +
# labs(x="Terms used more frequently overall →", y="Terms used more frequently by group →", title=title, subtitle=subtitle , caption = caption)
labs(x=paste("Terms used more frequently overall -->"), y=paste("Terms used more frequently by group -->"), title=title, subtitle=subtitle , caption = caption)
}
fw.keys <- function(fw.ch,n.keys=10) {
n.groups <- nrow(fw.ch$zeta)
keys <- matrix("",n.keys,n.groups)
colnames(keys) <- rownames(fw.ch$zeta)
for (g in 1:n.groups) {
keys[,g] <- names(sort(fw.ch$zeta[g,],dec=T)[1:n.keys])
}
keys
}
Load the data
poliblog.dfm <- readRDS("poliblog5k.dfm.rds")
poliblog.meta <- readRDS("poliblog5k.fullmeta.rds")
Calculate FW.
fw.blogideo <- fwgroups(poliblog.dfm,groups = poliblog5k.meta$rating)
Get and show the top words per group by zeta.
library(knitr)
fwkeys.blogideo <- fw.keys(fw.blogideo, n.keys=20)
kable(fwkeys.blogideo)
Conservative | Liberal |
---|---|
israel | mccain |
obama | bush |
eastern | said |
wright | sen |
chicago | presid |
especi | linktocom |
isra | postcount |
hama | postcounttb |
russian | tortur |
exit | iraq |
ed | john |
v | campaign |
may | today |
tip | administr |
rezko | â |
via | pm |
border | ad |
palestinian | know |
either | dem |
pakistani | rove |
Plot
p.fw.blogideo <- fw.ggplot.groups(fw.blogideo,sizescale=4,max.words=200,max.countrank=400,colorpalette=c("red","blue"))
p.fw.blogideo
Calculate FW and keys
fw.blogs <- fwgroups(poliblog.dfm,groups = poliblog5k.meta$blog)
Get and show the top words per group by zeta.
library(knitr)
fwkeys.blogs <- fw.keys(fw.blogs, n.keys=15)
kable(fwkeys.blogs)
at | db | ha | mm | tp | tpm |
---|---|---|---|---|---|
israel | linktocom | especi | eastern | bush | obama |
mr | postcount | exit | r | sen | campaign |
isra | postcounttb | v | updat | iraq | hillari |
russian | pm | franken | pentagon | said | dem |
chicago | tortur | although | dead | mccain | ad |
hama | digbi | obama | romney | u. | poll |
jewish | film | sadr | illeg | rove | mccain |
palestinian | dday | either | alien | administr | â |
tip | matthew | hillari | confirm | watch | franken |
wright | like | begun | legisl | presid | senat |
obama | think | may | immigr | iraqi | et |
iranian | villag | want | build | tortur | camp |
iran | religi | rezko | bailout | http | race |
jew | republican | rather | paulson | r-az | lead |
pakistani | thing | least | la | percent | late |
Plot
p.fw.blogs <- fw.ggplot.groups(fw.blogs,sizescale=3,max.words=200,max.countrank=400)
p.fw.blogs
FW makes a better (in my opinion) extractor of keywords than FREX, Lift, Score, etc.
Load a topic model (The no metadata 20-topic model from the STM notebook).
stm.nm <- readRDS("poliblog5k.fit.nometa.rds")
Calculate expected word frequency per topic across corpus:
stm.nm.top_word_tots <- colSums(sweep(stm.nm$theta,1,rowSums(poliblog.dfm),"*"))
stm.nm.top_word_assigns <- sweep(exp(stm.nm.beta),1,stm.nm.top_word_tots,"*")
Calculate FW
fw.topics.stm <- fwgroups(stm.nm.top_word_assigns,groups = as.factor(1:20))
Get and show the top words per topic by zeta.
library(knitr)
fwkeys.stm.nm <- fw.keys(fw.topics.stm, n.keys=15)
kable(fwkeys.stm.nm)
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
law | democrat | think | women | don | obama | race | attack | obama | tax | obama | war | stori | bush | mccain | world | voter | governor | money | clinton |
congress | republican | know | health | didn | barack | gop | terrorist | senat | economi | mccain | iraq | report | presid | john | america | elect | offici | billion | hillari |
bill | parti | thing | life | doesn | campaign | dem | nuclear | debat | econom | poll | militari | global | administr | campaign | countri | vote | report | million | win |
administr | elect | peopl | care | show | polit | seat | terror | said | american | voter | forc | york | white | palin | war | organ | depart | crisi | campaign |
court | conserv | like | church | updat | black | senat | iran | polici | health | percent | troop | articl | news | romney | foreign | group | offic | market | democrat |
bush | gop | realli | children | get | chicago | rep | weapon | foreign | job | barack | iraqi | book | hous | governor | nation | union | case | hous | primari |
justic | vote | right | educ | eastern | church | sen | bomb | barack | compani | lead | armi | polic | said | attack | unit | illeg | committe | spend | convent |
legal | polit | liber | famili | reader | wright | ballot | israel | presid | cut | state | govern | build | watch | sen | georgia | state | feder | race | |
rule | candid | way | school | isn | michell | poll | threat | joe | energi | among | secur | dead | interview | alaska | nuclear | communiti | charg | govern | romney |
intellig | win | media | human | video | race | campaign | kill | biden | cost | florida | qaeda | confirm | don | experi | peac | california | former | fund | candid |
presid | hous | say | univers | love | america | count | east | meet | price | number | afghanistan | publish | report | sarah | south | ballot | staff | congress | parti |
judg | progress | guy | social | night | team | candid | muslim | campaign | will | ohio | pakistan | journalist | sen | talk | power | worker | investig | dollar | nomin |
protect | congress | someth | god | exit | advis | vote | qaeda | committe | plan | ralli | surg | climat | press | raz | intern | border | inform | compani | florida |
act | voter | linktocommentspostcount | research | see | candid | district | intellig | bill | oil | margin | command | warm | fox | huckabe | must | immigr | senat | street | michigan |
constitut | reagan | postcounttb | christian | sure | associ | challeng | peac | sen | increas | virginia | withdraw | news | vice | gov | govern | regist | justic | wall | carolina |
Plot
p.stm.nm <- fw.ggplot.topics(fw.topics.stm,sizescale=2,max.words=50,max.countrank=400)
p.stm.nm