# Setup: make sure tidytext is available, then attach the packages used below.
# The install is guarded so that re-running the script does not reinstall the
# package (and does not hit the network) on every execution.
if (!requireNamespace("tidytext", quietly = TRUE)) {
  # Method 1: released version from CRAN.
  install.packages("tidytext")
  # Method 2 (alternative to method 1): development version from GitHub, i.e.
  # devtools::install_github("juliasilge/tidytext")
}
library(devtools)
library(tidytext)
library(tidyverse)
# Toy example: four lines of verse, one per row, in a single `text` column.
# tibble() is used because data_frame() is deprecated in the tibble package.
df <- tibble(text = c("به نام خداوند جان و خرد",
                      "کزین برتر اندیشه برنگذرد",
                      "خداوند نام و خداوند جای",
                      "خداوند روزی ده رهنمای"))
# Split the `text` column into one token per row, stored in a `word` column.
df %>% unnest_tokens(word, text)
# Read the raw text file without a header row and keep strings as character.
# NOTE(review): the single column name assigned below assumes the file parses
# into exactly one column (no tab characters inside lines) — confirm.
rostam_esfandiar <- read.delim(
  file = "sample.txt",
  stringsAsFactors = FALSE,
  header = FALSE
)
colnames(rostam_esfandiar) <- "beyt"
# A chapter heading is the word "بخش" followed by a space and Persian digits.
regex_chapter <- "بخش [۰۱۲۳۴۵۶۷۸۹]+"
library(stringr)
# Each row that matches a heading increments the running total, so `chapter`
# holds the number of headings seen up to and including that row (rows before
# the first heading get 0).
rostam_esfandiar <- rostam_esfandiar %>%
  mutate(chapter = cumsum(str_detect(beyt, regex_chapter)))
# Preview ten randomly chosen rows.
rostam_esfandiar %>% sample_n(10)
# Step 1: break every `beyt` into one word per row.
rostam_esfandiar <- unnest_tokens(rostam_esfandiar, word, beyt)
# Step 2: count how often each word occurs in each chapter and sort so that
# the most frequent words of every chapter come first.
rostam_esfandiar <- rostam_esfandiar %>%
  group_by(chapter, word) %>%
  summarise(count = n()) %>%
  arrange(chapter, desc(count))
# Preview the first ten rows of the count table.
head(rostam_esfandiar, n = 10)
# Keep the (at most) ten most frequent words of each chapter.
# The grouping is stated explicitly instead of relying on whatever grouping
# the input happens to carry from earlier steps. row_number(desc(count)) gives
# rank 1 to the largest count and breaks ties by row order, which is what the
# previous rank() -> as.integer() -> row_number() chain amounted to.
rostam_esfandiar.top10 <- rostam_esfandiar %>%
  group_by(chapter) %>%
  mutate(rank = row_number(desc(count))) %>%
  filter(rank <= 10) %>%
  ungroup()
# Bar chart of the top words, one facet per chapter, drawn horizontally.
# NOTE(review): reorder() orders words by their overall mean of -count across
# all chapters, so bars inside a single facet are not guaranteed to be sorted;
# tidytext::reorder_within() may be what is wanted — confirm.
p <- ggplot(
  data = rostam_esfandiar.top10,
  mapping = aes(
    x = reorder(word, -count),
    y = count,
    fill = as.factor(paste("بخش", chapter, sep = " "))
  )
) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~chapter, ncol = 5, scales = "free") +
  theme_minimal() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  )
# plotly is attached here; it provides ggplotly(), which converts the ggplot
# object into an interactive plot.
library(plotly)
ggplotly(p)
# Load the corpus; the steps below use its `text` and `book` columns.
shahname <- read_csv(file = "shahname.csv")
head(shahname, n = 5)
# One word per row, then the number of occurrences of each word in each book.
shahname.words <- unnest_tokens(shahname, word, text)
shahname.words <- shahname.words %>%
  group_by(book, word) %>%
  summarise(count = n())
# Preview the most frequent words, sorted by book and descending count.
head(arrange(shahname.words, book, desc(count)), n = 5)
# Add tf, idf and tf_idf columns, treating each book as a document and each
# word as a term, then sort by book with the highest tf_idf first.
shahname.words <- arrange(
  bind_tf_idf(shahname.words, word, book, count),
  book,
  desc(tf_idf)
)
# Preview the first ten rows.
head(shahname.words, n = 10)
# Keep the (at most) ten words with the highest tf_idf in each book.
# The grouping by book is stated explicitly so the ranking does not depend on
# grouping that may or may not have survived the earlier steps.
# row_number(desc(tf_idf)) gives rank 1 to the largest value and breaks ties
# by row order.
plot.data <- shahname.words %>%
  group_by(book) %>%
  mutate(rank = row_number(desc(tf_idf))) %>%
  filter(rank <= 10) %>%
  ungroup() %>%
  arrange(book, rank)
# Bar chart of the top tf_idf words, one facet per book in a two-column grid.
# NOTE(review): reorder() sorts by the overall mean of -tf_idf, not per facet;
# confirm this ordering is intended.
p <- ggplot(
  data = plot.data,
  mapping = aes(
    x = reorder(word, -tf_idf),
    y = tf_idf,
    fill = book
  )
) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  theme_minimal() +
  facet_wrap(~book, ncol = 2, scales = "free")
# Render the plot interactively.
ggplotly(p)
# Split every text into two-word sequences (bigrams), one bigram per row.
shahname.bigrams <- shahname %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)
shahname.bigrams %>% head(5)
# Count each bigram within each book and put the most frequent ones first.
shahname.bigrams <- shahname.bigrams %>%
  group_by(book, bigram) %>%
  summarise(count = n()) %>%
  arrange(book, desc(count))
# The table was just sorted by (book, desc(count)), so the preview does not
# need to sort it a second time.
shahname.bigrams %>% head(5)
# Preview only: split each bigram at the space into `word1` and `word2`.
# The result is printed, not stored, so shahname.bigrams is left unchanged.
head(
  separate(shahname.bigrams, bigram, into = c("word1", "word2"), sep = " "),
  n = 5
)
# Add tf, idf and tf_idf columns, treating each book as a document and each
# bigram as a term, then sort by book with the highest tf_idf first.
shahname.bigrams <- arrange(
  bind_tf_idf(shahname.bigrams, bigram, book, count),
  book,
  desc(tf_idf)
)
# Preview the first five rows.
head(shahname.bigrams, n = 5)
# Keep the (at most) ten bigrams with the highest tf_idf in each book.
# The grouping by book is stated explicitly so the ranking does not depend on
# grouping that may or may not have survived the earlier steps.
# row_number(desc(tf_idf)) gives rank 1 to the largest value and breaks ties
# by row order.
plot.data <- shahname.bigrams %>%
  group_by(book) %>%
  mutate(rank = row_number(desc(tf_idf))) %>%
  filter(rank <= 10) %>%
  ungroup() %>%
  arrange(book, rank)
# Bar chart of the top tf_idf bigrams, one facet per book. The x-axis labels
# are rotated 45 degrees and both axis titles are removed.
# NOTE(review): reorder() sorts by the overall mean of -tf_idf, not per facet;
# confirm this ordering is intended.
p <- ggplot(
  data = plot.data,
  mapping = aes(
    x = reorder(bigram, -tf_idf),
    y = tf_idf,
    fill = book
  )
) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  facet_wrap(~book, scales = "free") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  )
# Render the plot interactively.
ggplotly(p)