-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhw_notebook.Rmd
261 lines (245 loc) · 9.67 KB
/
hw_notebook.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
---
title: "R Notebook"
output: html_notebook
---
```{r}
## prepare and require all packages needed
rm(list = ls(all = TRUE))
library(igraph)
library(data.table)
library(readxl)
library(splitstackshape)
library(stringr)
library(plyr)
```
```{r warning=FALSE}
## Data preparation and cleaning
# Builds the objects the later chunks rely on:
#   inv       - one row per funding event: deal date + one column per investor
#   edgelist  - dated co-investment ties (date, F1, F2), one row per investor
#               pair per funding event
#   edgelist1 - distinct (F1, F2) pairs
#   g_inv     - undirected co-investment network over the whole period

# Import the data. Columns 4 and 11 of each file are used as the deal date
# and the comma-separated investor list.
data_dir <- "E:/Fall term/Social Network Analysis/HW2"
data1 <- read.csv(file.path(data_dir, "Funding_events_7.14.csv"),
                  header = TRUE)
data2 <- read_xlsx(file.path(data_dir, "Funding_events_7.14_page2.xlsx"))
inv1 <- data.frame(data1[, c(4, 11)])
inv2 <- data.frame(data2[, c(4, 11)])
colnames(inv1) <- c("date", "investors")
colnames(inv2) <- c("date", "investors")

# Combine the investors of both files (the date is kept for Q2 and Q3) and
# drop events without any investor as well as exact duplicate rows.
inv <- rbind(inv1, inv2)
inv$investors <- as.character(inv$investors)
inv <- inv[!is.na(inv$investors) & inv$investors != "", ]
inv <- unique(inv)

# Clean the data: strip legal-form suffixes so that the comma inside e.g.
# "Acme, Inc." is not taken as a separator between two investors. The suffix
# has to end at a word boundary and the trailing period is matched literally;
# an unescaped "." matches any character and would cut into the following
# text (", Incubator ..." or ", Inc, Next Firm").
inv$investors <- gsub(", (Inc|LLC|Ltd|lp)\\b\\.?", "", inv$investors,
                      perl = TRUE)

# Split the investor list into one column per investor. The result is turned
# into a plain data frame so that columns can be selected by position.
inv <- concat.split(inv, sep = ",", split.col = "investors", drop = TRUE)
inv <- as.data.frame(inv)
firm_cols <- seq_len(ncol(inv))[-1]
if (length(firm_cols) < 2) {
  stop("no funding event lists more than one investor", call. = FALSE)
}
# A missing second investor means there is only one investor in the row, so
# no relationship is captured by it.
inv <- inv[!is.na(inv[[firm_cols[2]]]), ]

## Investors in the same row have relationships with each other: emit one
## (date, F1, F2) tie for every pair of investor columns.
# The pairs come from the actual number of investor columns, every pair is
# emitted exactly once (seeding the list with the first pair and then looping
# over it again would double those ties in the networks built from
# `edgelist`), and the pieces are bound in a single rbind() call.
col_pairs <- combn(firm_cols, 2, simplify = FALSE)
pair_ties <- lapply(col_pairs, function(p) {
  d <- inv[, c(1, p[1], p[2])]
  colnames(d) <- c("date", "F1", "F2")
  # Keep a tie whenever both investors are present. Ties without a date stay
  # in the static network; the Q2 chunk removes them with na.omit() before
  # the dated analysis.
  d[!is.na(d$F1) & !is.na(d$F2), ]
})
edgelist <- do.call(rbind, pair_ties)
rownames(edgelist) <- NULL
edgelist1 <- unique(edgelist[, c("F1", "F2")])

# Build the network. unique() cannot merge a pair listed in both orders
# (A-B and B-A), so simplify() collapses parallel edges and removes
# self-loops; otherwise parallel edges multiply the shortest paths counted by
# betweenness(). The graph has no edge attributes, so there is nothing to
# prune by attribute value.
g_inv <- graph_from_data_frame(edgelist1, directed = FALSE)
g_inv <- igraph::simplify(g_inv)
```
```{r warning=FALSE}
# Question 1: Which firm is the center of the VC network?
## Q1-A: find the center as the vertex with the highest closeness
# NOTE(review): the interpretation below assumes unreachable pairs count as
# distance N, which is what Q1-B and Q1-C do explicitly. Recent igraph
# releases are documented to restrict closeness() to reachable vertices, so
# confirm the result against the installed version.
# `close` shadows base::close(); the name is kept because later chunks use it.
close <- data.frame(closeness(g_inv))
max_closensess <- row.names(close)[which.max(close[, 1])]
print(paste("Center of network is", max_closensess))
## Reported result: Intel Capital is the center of the network.

## Q1-B: find the firm with the lowest average shortest path
# For every vertex, average its distances to all N vertices. A vertex that
# cannot be reached is counted with distance N. Distances are requested one
# vertex at a time so the full N x N matrix is never held in memory.
N <- gorder(g_inv)
spathAvg <- vapply(seq_len(N), function(v) {
  d <- distances(g_inv, v = v)
  d[is.infinite(d)] <- N
  sum(d, na.rm = TRUE) / N
}, numeric(1))
names(spathAvg) <- V(g_inv)$name
which.min(spathAvg)
### Check: the firm with the highest closeness also has the lowest average
### shortest path (reported: Intel Capital).

## Q1-C: average shortest path over all firms
path <- mean_distance(g_inv, directed = FALSE, unconnected = FALSE)
print(paste("average shortest path for all firms is", path))
# Reported result: 969.7391, which is high. With unconnected = FALSE every
# unreachable pair is counted with a path length equal to the number of
# vertices, and many pairs of firms are unreachable, so the average is
# dominated by those pairs.
```
```{r}
# Q2-A: how does the average coreness change over time?
## Give each tie (relationship) a month index according to its deal date.
if (!"date" %in% colnames(edgelist)) {
  stop("`edgelist` has no `date` column; re-run the data preparation chunk",
       call. = FALSE)
}
edgelist <- na.omit(edgelist)
# Splitting an m/d/yyyy date on "/" yields date_1 (month), date_2 (day) and
# date_3 (year).
edgelist <- concat.split(edgelist, sep = "/", split.col = "date", drop = TRUE)
edgelist <- as.data.frame(edgelist)
tie_month <- suppressWarnings(as.integer(as.character(edgelist$date_1)))
tie_year <- suppressWarnings(as.integer(as.character(edgelist$date_3)))

# Only ties dated within 1981-2014 get a month index. A tie whose date cannot
# be parsed or lies outside that range is dropped with a warning; leaving it
# at a default index would place it before month 1 and therefore inside every
# cumulative network.
first_year <- 1981L
last_year <- 2014L
dated <- !is.na(tie_month) & !is.na(tie_year) &
  tie_month >= 1L & tie_month <= 12L &
  tie_year >= first_year & tie_year <= last_year
if (any(!dated)) {
  warning(sum(!dated), " ties dropped: date unparsable or outside ",
          first_year, "-", last_year, call. = FALSE)
}
# Months are counted from January 1981 and shifted by 5, which makes June
# 1981 month 1 (presumably the first month with data -- confirm).
edgelist$month <- (tie_year - first_year) * 12L + tie_month - 5L
edgelist <- edgelist[dated, c("F1", "F2", "month")]
if (nrow(edgelist) == 0) {
  stop("no dated ties left to analyse", call. = FALSE)
}

# Average coreness at each month. The network is cumulative: the network of
# month m contains every tie dated up to and including m.
months <- seq_len(max(edgelist$month, 0L))
avg_core <- data.frame(
  month = months,
  coreness = vapply(months, function(m) {
    g <- graph_from_data_frame(edgelist[edgelist$month <= m, ],
                               directed = FALSE)
    mean(coreness(g))
  }, numeric(1))
)
plot(avg_core$month, avg_core$coreness, type = "l")
```
```{r}
## Q2-B: coreness over time when ties are allowed to "decay"
## A tie is removed from the network if it is not renewed within 10 years.
## A renewed tie shows up in the edge list again after its first appearance,
## so from month 121 onwards only the most recent 120 months are used: the
## network of month 121 is built from months 2 to 121. A tie from month 1
## that was renewed appears again somewhere in months 2 to 121; one that was
## not renewed is ignored.
last_month <- max(edgelist$month)
avg_core_decay <- data.frame(month = 1:last_month)
avg_core_decay$coreness <- vapply(avg_core_decay$month, function(m) {
  # Up to month 120 nothing can have expired yet, so there is no lower bound.
  oldest <- if (m <= 120) -Inf else m - 120
  window <- edgelist[edgelist$month <= m & edgelist$month > oldest, ]
  mean(coreness(graph.data.frame(window, directed = FALSE)))
}, numeric(1))
plot(avg_core_decay$month, avg_core_decay$coreness, type = "l")
## Result: the curve is similar to the one without decay; they do not differ
## much.
## This may show that core relationships in the network are usually
## long-lasting: removing the relationships that were not renewed within 10
## years does not change the coreness of the network as a whole.
```
```{r}
# Question 3: Is the recent network more of a core-periphery structure or a
# structure made up of distinctly clustered components?
## When many nodes are members of k-cores with high degrees, dense clusters
## may exist within the network. So if the number of nodes with high coreness
## grows over time, the recent network looks more like a structure made up
## of distinctly clustered components.
## First evidence
# For each monthly cumulative network, take the share of nodes whose coreness
# is above the network's average coreness. An increasing share points to a
# more clustered structure.
evidence1 <- data.frame(1:max(edgelist$month))
evidence1$rate <- vapply(evidence1[, 1], function(m) {
  snapshot <- graph.data.frame(edgelist[edgelist$month <= m, ],
                               directed = FALSE)
  k <- coreness(snapshot)
  sum(k > mean(k)) / gorder(snapshot)
}, numeric(1))
plot(evidence1[, 1], evidence1$rate, type = "l")
## Result: the share is decreasing in the recent period, so the recent
## network looks more like a core-periphery structure.
```
```{r}
## Second evidence
# Median coreness of each monthly cumulative network.
evidence2 <- data.frame(1:max(edgelist$month))
evidence2$median <- vapply(evidence2[, 1], function(m) {
  snapshot <- graph.data.frame(edgelist[edgelist$month <= m, ],
                               directed = FALSE)
  median(coreness(snapshot))
}, numeric(1))
plot(evidence2[, 1], evidence2$median, type = "l")
## Result: the median coreness is also decreasing in the recent period, so
## the recent network looks more like a core-periphery structure.
```
```{r warning=FALSE}
# Q4: does being in the core of the network help venture capital firms and
# the entrepreneurs they work with to perform better?
# Q4-A: being central ~ having more successful investments

# Correlation, year by year, between a centrality score and a firm-year
# outcome.
#   centrality, firm : score and firm name, one entry per network vertex
#   outcome          : data frame whose first three columns are firm name,
#                      year and outcome value
#   years            : calendar years to report on
# Returns one correlation per entry of `years`. A year with fewer than two
# matched firms, or with a constant score or outcome, yields NA_real_.
# Firms absent from the network, missing outcome values and years outside
# `years` are ignored.
yearly_centrality_cor <- function(centrality, firm, outcome,
                                  years = 1981:2014) {
  outcome_firm <- as.character(outcome[[1]])
  outcome_year <- outcome[[2]]
  outcome_value <- outcome[[3]]
  # If a firm-year is listed more than once, the last record is used.
  latest <- !duplicated(data.frame(outcome_firm, outcome_year),
                        fromLast = TRUE)
  score <- centrality[match(outcome_firm, firm)]
  usable <- latest & !is.na(score) & !is.na(outcome_value) &
    !is.na(outcome_year)
  vapply(years, function(y) {
    rows <- which(usable & outcome_year == y)
    x <- score[rows]
    v <- outcome_value[rows]
    if (length(rows) < 2 || length(unique(x)) < 2 || length(unique(v)) < 2) {
      return(NA_real_)
    }
    cor(x, v)
  }, numeric(1))
}

performance <- read.csv("E:/Fall term/Social Network Analysis/HW2/Venture_capital_firm_outcomes.csv", header = TRUE)
# Columns 1-3 are used as firm name, year and the number of successful
# investments.
success <- performance[, c(1, 2, 3)]

## Closeness
# Rebuild `close` as (closeness, firm) from its first column and row names so
# that the chunk can be re-run; later chunks read these two columns.
close <- data.frame(
  closeness = close[[1]],
  firm = rownames(close),
  row.names = rownames(close),
  stringsAsFactors = FALSE
)
corr <- data.frame(1981:2014)
corr[, 2] <- yearly_centrality_cor(close$closeness, close$firm, success)
mean(corr[, 2], na.rm = TRUE)
# Reported result: the average correlation is 0.0260.
## The yearly average of the correlation between closeness and successful
## investments is positive, but very small in absolute terms, so the
## relationship is weak.

## Betweenness
between <- data.frame(betweenness(g_inv))
colnames(between) <- "betweenness"
between$firm <- rownames(between)
corr_betw <- data.frame(1981:2014)
corr_betw[, 2] <- yearly_centrality_cor(between$betweenness, between$firm,
                                        success)
mean(corr_betw[, 2], na.rm = TRUE)
# Reported result: the average correlation is 0.4298.
## In terms of betweenness centrality the correlation shows that, yes, a
## venture capital firm being at the center of the network is related to
## having more successful investments in a given year.
```
```{r}
# Q4-B: being central ~ being less likely to go out of business

# Correlation, year by year, between a centrality score and a firm-year
# outcome.
#   centrality, firm : score and firm name, one entry per network vertex
#   outcome          : data frame whose first three columns are firm name,
#                      year and outcome value
#   years            : calendar years to report on
# Returns one correlation per entry of `years`. A year with fewer than two
# matched firms, or with a constant score or outcome, yields NA_real_.
# Firms absent from the network, missing outcome values and years outside
# `years` are ignored.
yearly_centrality_cor <- function(centrality, firm, outcome,
                                  years = 1981:2014) {
  outcome_firm <- as.character(outcome[[1]])
  outcome_year <- outcome[[2]]
  outcome_value <- outcome[[3]]
  # If a firm-year is listed more than once, the last record is used.
  latest <- !duplicated(data.frame(outcome_firm, outcome_year),
                        fromLast = TRUE)
  score <- centrality[match(outcome_firm, firm)]
  usable <- latest & !is.na(score) & !is.na(outcome_value) &
    !is.na(outcome_year)
  vapply(years, function(y) {
    rows <- which(usable & outcome_year == y)
    x <- score[rows]
    v <- outcome_value[rows]
    if (length(rows) < 2 || length(unique(x)) < 2 || length(unique(v)) < 2) {
      return(NA_real_)
    }
    cor(x, v)
  }, numeric(1))
}

# Columns 1, 2 and 4 are used as firm name, year and the out-of-business
# outcome.
out <- performance[, c(1, 2, 4)]

## Closeness
close1 <- close[, c(1, 2)]
corr1 <- data.frame(1981:2014)
corr1[, 2] <- yearly_centrality_cor(close1[[1]], close1$firm, out)
mean(corr1[, 2], na.rm = TRUE)
# Reported result: the average correlation is -0.26307.
## The yearly average of the correlation between closeness and going out of
## business is negative. So yes, a venture capital firm being at the center
## of the network is related to being less likely to go out of business.

## Betweenness
between1 <- between[, c(1, 2)]
corr_betw1 <- data.frame(1981:2014)
corr_betw1[, 2] <- yearly_centrality_cor(between1[[1]], between1$firm, out)
mean(corr_betw1[, 2], na.rm = TRUE)
# Reported result: the average correlation is -0.1534289.
## In terms of betweenness centrality the correlation shows that, yes, a
## venture capital firm being at the center of the network is related to
## being less likely to go out of business.
```