Textual Data

Headlines data

We use the News Aggregator Data Set from the UCI Machine Learning Repository, which contains headlines and categories for roughly 400k news items scraped from the web in 2014. The columns are:

ID : the numeric ID of the article
TITLE : the headline of the article
URL : the URL of the article
PUBLISHER : the publisher of the article
CATEGORY : the category of the news item; one of: — b : business — t : science and technology — e : entertainment — m : health
STORY : alphanumeric ID of the news story that the article discusses
HOSTNAME : hostname where the article was posted
TIMESTAMP : approximate timestamp of the article’s publication, given in Unix time (the values in this file are in milliseconds since midnight on Jan 1, 1970, so they are divided by 1000 before conversion)

# Remove all the variables in the environment.
# NOTE(review): this deletes every visible object in the current workspace;
# consider starting a fresh R session instead.
rm(list = ls())

# Read the news headlines, keeping text columns as character vectors,
# then report the number of rows and columns.
data <- read.csv("uci-news-aggregator.csv", fill = TRUE, sep = ",",
                 stringsAsFactors = FALSE)
dim(data)

## [1] 422419      8

# Column names of the news data frame
names(data)

## [1] "ID"        "TITLE"     "URL"       "PUBLISHER" "CATEGORY"  "STORY"    
## [7] "HOSTNAME"  "TIMESTAMP"

# Put the columns of `data` on the search path so that ID, TITLE, URL, ...
# can be referenced by name in the snippets that follow.
attach(data)
# First five article IDs
head(ID, 5)

## [1] 1 2 3 4 5

# First five headlines
head(TITLE, 5)

## [1] "Fed official says weak data caused by weather, should not slow taper" 
## [2] "Fed's Charles Plosser sees high bar for change in pace of tapering"   
## [3] "US open: Stocks fall after Fed official hints at accelerated tapering"
## [4] "Fed risks falling 'behind the curve', Charles Plosser says"           
## [5] "Fed's Plosser: Nasty Weather Has Curbed Job Growth"

# First five article URLs
head(URL, 5)

## [1] "http://www.latimes.com/business/money/la-fi-mo-federal-reserve-plosser-stimulus-economy-20140310,0,1312750.story\\?track=rss"
## [2] "http://www.livemint.com/Politics/H2EvwJSK2VE6OF7iK1g3PP/Feds-Charles-Plosser-sees-high-bar-for-change-in-pace-of-xfzta.html" 
## [3] "http://www.ifamagazine.com/news/us-open-stocks-fall-after-fed-official-hints-at-accelerated-tapering-294436"                 
## [4] "http://www.ifamagazine.com/news/fed-risks-falling-behind-the-curve-charles-plosser-says-294430"                              
## [5] "http://www.moneynews.com/Economy/federal-reserve-charles-plosser-weather-job-growth/2014/03/10/id/557011"

# First five publishers
head(PUBLISHER, 5)

## [1] "Los Angeles Times" "Livemint"          "IFA Magazine"     
## [4] "IFA Magazine"      "Moneynews"

# First five category codes
head(CATEGORY, 5)

## [1] "b" "b" "b" "b" "b"

# First five story (cluster) IDs
head(STORY, 5)

## [1] "ddUyU0VZz0BRneMioxUPQVP6sIxvM" "ddUyU0VZz0BRneMioxUPQVP6sIxvM"
## [3] "ddUyU0VZz0BRneMioxUPQVP6sIxvM" "ddUyU0VZz0BRneMioxUPQVP6sIxvM"
## [5] "ddUyU0VZz0BRneMioxUPQVP6sIxvM"

# First five hostnames
head(HOSTNAME, 5)

## [1] "www.latimes.com"     "www.livemint.com"    "www.ifamagazine.com"
## [4] "www.ifamagazine.com" "www.moneynews.com"

# First five raw timestamps
head(TIMESTAMP, 5)

## [1] 1.39447e+12 1.39447e+12 1.39447e+12 1.39447e+12 1.39447e+12

There are 422419 observations in this dataset.

Time

The news items were published between 2014-03-10 16:52:50 GMT and 2014-08-28 12:33:11 GMT.

# Convert the raw timestamps to date-times. as.POSIXct() interprets a number
# as seconds since `origin`, hence the division by 1000 first.
mytime <- as.POSIXct(TIMESTAMP / 1000, origin = "1970-01-01", tz = "GMT")
# Earliest and latest publication time
range(mytime)

## [1] "2014-03-10 16:52:50 GMT" "2014-08-28 12:33:11 GMT"

Website

#======
# Website
# Splitting "scheme://host/path" on "/" yields c("scheme:", "", "host", ...),
# so the host name is the third piece of every URL.
temp <- strsplit(data$URL, "/", fixed = TRUE)
len <- length(temp)
# vapply() replaces the index loop: it copes with an empty input (1:len would
# iterate over c(1, 0)) and gives NA instead of a "subscript out of bounds"
# error for a URL that has fewer than three pieces.
wbsite <- vapply(temp, function(parts) parts[3], character(1),
                 USE.NAMES = FALSE)
head(wbsite)

## [1] "www.latimes.com"     "www.livemint.com"    "www.ifamagazine.com"
## [4] "www.ifamagazine.com" "www.moneynews.com"   "www.nasdaq.com"

# Store the extracted host name alongside the original columns
data$WebSite <- wbsite
# Article counts per website, most frequent first (the table is built once
# and reused for sorting)
wbtab <- table(wbsite)
wbtab2 <- sort(wbtab, decreasing = TRUE)
wbdf <- as.data.frame(wbtab2)
# Share of all articles per website: count divided by the number of URLs
wbdf$Density <- wbdf$Freq / len
# The 30 most frequent websites; head() avoids the NA padding that [1:30]
# produces when there are fewer than 30 rows
head(wbdf$wbsite, 30)

##  [1] in.reuters.com              www.huffingtonpost.com     
##  [3] www.businessweek.com        www.contactmusic.com       
##  [5] www.dailymail.co.uk         www.nasdaq.com             
##  [7] www.examiner.com            www.globalpost.com         
##  [9] www.latimes.com             www.bizjournals.com        
## [11] www.rttnews.com             thecelebritycafe.com       
## [13] www.washingtonpost.com      www.entertainmentwise.com  
## [15] www.forbes.com              www.bloomberg.com          
## [17] www.nydailynews.com         www.marketwatch.com        
## [19] time.com                    perezhilton.com            
## [21] www.hngn.com                timesofindia.indiatimes.com
## [23] www.reuters.com             www.theguardian.com        
## [25] www.telegraph.co.uk         www.wetpaint.com           
## [27] au.ibtimes.com              blogs.wsj.com              
## [29] www.techtimes.com           www.business-standard.com  
## 11237 Levels: in.reuters.com www.huffingtonpost.com ... zumic.com

# Output as csv
write.csv(wbdf, file = "website.csv")

Story

There are:

2075 clusters of similar news for entertainment category
1789 clusters of similar news for science and technology category
2019 clusters of similar news for business category
1347 clusters of similar news for health category

# Business: story IDs of the articles in category "b", tabulated into one
# row per distinct story; dim() then reports how many stories there are.
story_b <- STORY[CATEGORY == "b"]
tsb <- as.data.frame(table(story_b))
dim(tsb)

## [1] 2019    2

# Entertainment: story IDs of the articles in category "e", tabulated into
# one row per distinct story; dim() then reports how many stories there are.
story_e <- STORY[CATEGORY == "e"]
tse <- as.data.frame(table(story_e))
dim(tse)

## [1] 2075    2

# Health: story IDs of the articles in category "m", tabulated into one row
# per distinct story; dim() then reports how many stories there are.
story_m <- STORY[CATEGORY == "m"]
tsm <- as.data.frame(table(story_m))
dim(tsm)

## [1] 1347    2

# Science and technology: story IDs of the articles in category "t",
# tabulated into one row per distinct story; dim() then reports how many
# stories there are.
story_t <- STORY[CATEGORY == "t"]
tst <- as.data.frame(table(story_t))
dim(tst)

## [1] 1789    2

Numerical Data

We use Dow Jones Industrial Average (DJIA) index data here, covering the period from 2008-08-08 to 2016-07-01.

# Read the DJIA price table, keeping text columns as character vectors,
# then report the number of rows and columns.
DJIA <- read.csv("DJIA_table.csv", fill = TRUE, sep = ",",
                 stringsAsFactors = FALSE)
dim(DJIA)

## [1] 1989    7

# Column names of the DJIA data frame
names(DJIA)

## [1] "Date"      "Open"      "High"      "Low"       "Close"     "Volume"   
## [7] "Adj.Close"

# Put the DJIA columns on the search path so that Date, Close, ... can be
# referenced by name in the snippets that follow.
attach(DJIA)

Date

# Storage class of the Date column
class(Date)

## [1] "character"

# Smallest and largest Date value
range(Date)

## [1] "2008-08-08" "2016-07-01"

Close Price

# Close Price
summary(Close)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    6547   10913   13026   13463   16478   18312

# Log Close Price: natural logarithm of the closing price and its summary
log_Close <- log(Close)
summary(log_Close)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   8.787   9.298   9.475   9.479   9.710   9.815

# Log Return
# diff() subtracts each row from the following row in file order, so on its
# own it yields forward-in-time log returns only when rows are sorted
# oldest-first. Multiplying by the sign of the date step turns every element
# into log(later close) - log(earlier close), whatever the row order.
# NOTE(review): the summary printed below for the unadjusted diff() has a
# negative mean, which suggests DJIA_table.csv is stored newest-first --
# confirm the row order and re-run to refresh that printed summary.
date_step <- sign(as.numeric(diff(as.Date(Date))))
log_Return <- diff(log_Close, differences = 1) * date_step
summary(log_Return)

##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -0.1050835 -0.0057316 -0.0005430 -0.0002138  0.0045634  0.0820051

# Histogram
# Density-scaled histogram of the closing price (bars alternate two colours)
hist(Close, freq = FALSE, main = "Close Price (DJIA) Histogram",
     col = c(2, 3), xlab = "Close Price")

# Density-scaled histogram of the log closing price
hist(log_Close, freq = FALSE, main = "Log of Close Price (DJIA) Histogram",
     col = c(2, 3), xlab = "log(Close Price)")

# Density-scaled histogram of the log returns
hist(log_Return, freq = FALSE, main = "Log of Return (DJIA) Histogram",
     col = c(2, 3), xlab = "log(Return)")

# Closing price over time
plot(as.Date(Date), Close, type = "l", xlab = "Date", ylab = "Close Price")

# Log closing price over time
plot(as.Date(Date), log_Close, type = "l", xlab = "Date",
     ylab = "log(Close Price)")

# Log return over time. Each return spans two consecutive rows and is drawn
# at the later of the two dates; Date[-1] is that date only when the rows
# are sorted oldest-first, so the later date is picked explicitly.
obs_date <- as.Date(Date)
return_date <- pmax(obs_date[-1], obs_date[-length(obs_date)])
plot(return_date, log_Return, type = "l", xlab = "Date", ylab = "log(Return)")

Welcome!

How to Simply Get the Overview of Textual Data and Numerical Data