* My professor uploaded this as a guide. I need clarification on whether my answer matches your answer before I submit my assignment, because I am getting errors and I want to learn.
I have uploaded a zip file, and there is an Excel CSV file in the data folder. I tried to upload them separately, but this website would not allow me to. This is all the information I have.
# ----------- Chapter 14: Word Perfect -----------
library(XML)
library(tm)
# ----- Read the speech text -----
# Local-file option: the path below is machine-specific and will need to be
# updated. Note that this value is immediately overwritten by the read.csv()
# call that follows, so the path is kept only as a reference.
sbaFile <- "/Users/jsaltz/Google Drive/Courses/IST 687/2U/Week 8 – Text Mining/data/sba-speech.txt"
# Read the CSV version of the data; stringsAsFactors = FALSE keeps the text
# column as plain character strings rather than factors
sbaFile <- read.csv("sample.csv", stringsAsFactors = FALSE)
head(sbaFile)
# Keep only the 'text' column as a character vector
sbaFile <- sbaFile$text
head(sbaFile)
# Alternative 1: use scan() to read the local text file
# sba <- scan(sbaFile, character(0), sep = "\n")
# sba <- scan(sbaFile, character(0))
# head(sba, 10)
# Alternative 2: use readLines() to read the local text file
# sba <- readLines(sbaFile)
# head(sba, 3)
# ----- Read the speech from the web -----
# URLencode() percent-encodes any characters that are not URL-safe
sbaLocation <- URLencode("http://www.historyplace.com/speeches/anthony.htm")
# Read and parse the HTML file; useInternalNodes = TRUE returns a C-level
# document so that XPath queries (xpathApply) can be run against it
doc.html <- htmlTreeParse(sbaLocation, useInternalNodes = TRUE)
# Extract the text of all paragraph (<p>) nodes, searching from the root of
# the document. unlist() flattens the resulting list into a character vector.
sba <- unlist(xpathApply(doc.html, "//p", xmlValue))
head(sba, 3)
# ----- Build and clean the corpus -----
# Wrap the character vector as a tm source, then build a corpus from it
words.vec <- VectorSource(sba)
words.corpus <- Corpus(words.vec)
words.corpus
# Normalize the text: lower-case, then strip punctuation, numbers, and
# common English stop words ("the", "and", ...) that carry little meaning
words.corpus <- tm_map(words.corpus, content_transformer(tolower))
words.corpus <- tm_map(words.corpus, removePunctuation)
words.corpus <- tm_map(words.corpus, removeNumbers)
words.corpus <- tm_map(words.corpus, removeWords, stopwords("english"))
# Term-document matrix: rows are terms, columns are documents (paragraphs)
tdm <- TermDocumentMatrix(words.corpus)
tdm
# ----- Tally word frequencies -----
# Convert the sparse term-document matrix to a plain matrix, then sum each
# row to get the total count of every term across all documents
m <- as.matrix(tdm)
wordCounts <- sort(rowSums(m), decreasing = TRUE)
# Inspect the most frequent terms
head(wordCounts)
# ----- Visualize as a word cloud -----
library(wordcloud)
# Basic cloud: a data frame of words and their frequencies
cloudFrame <- data.frame(word = names(wordCounts), freq = wordCounts)
wordcloud(cloudFrame$word, cloudFrame$freq)
# Tuned cloud: drop words seen fewer than 2 times, cap at 50 words,
# rotate ~35% of words, and color by frequency with the "Dark2" palette
wordcloud(names(wordCounts), wordCounts, min.freq = 2, max.words = 50,
          rot.per = 0.35, colors = brewer.pal(8, "Dark2"))


0 comments