;; urls.r
REBOL [
    Purpose: {
        Get Everything from a URL
    }
    Note: {
        I saw this script on REBOL.org:
        http://www.rebol.org/view-script.r?script=url-handler.r
        but the code seemed crazy-long for REBOL and too confusing for my liking
    }
    History: [
    ]
    Details: [
        Title: none
        Creation: none
        Name: none
        Version: 1.0
        File: %url-parse.r
        Home: http://none
        Programmer: "Time Series Lord"
        Rights: "Copyright (c) 2016"
    ]
    Tests: [
        http://agora-dev.org//forums//archive//subsarch/index.php?site
        http://agora-dev.org//forums/archive/index.php?site
        http://agora-dev.org//forums/index.php?site
        http://agora-dev.org/forums/archive//subarch/index.php?site
        http://agora-dev.org//forums//archive//subarch/index.php?site
        http://agora-dev.org//forums/archive/index.php?site
        http://agora-dev.org//forums/index.php?site
        http://agora-dev.org/forums/index.php?site
        http://a.b.c.d.subby.dommy.toppy/archive/some/path/to/file/file.html?zz=b&hj=d&e=5&f=a%20b&sid=5&nothing=&re=bol#s3
        http://www.rebol.com/docs/core23/rebolcore-10.html#section-2
        http://www.rebol.com/
        http://rebol.com/
        http://rebol.com
    ]
]
comment {
    for the agora-style URLs, gather a block of the words that have a "//"
    in front of them; later, after building the path, go back, find those
    words, and fix them (see the sketch just below)
    it's hacky but...
}
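
;; A minimal sketch (added for illustration, not part of the original logic)
;; of the double-slash gathering rule used inside url-parse below. The input
;; string is an assumed intermediate value for one of the Tests URLs above.
;; It is wrapped in comment [] so it loads with the script but never runs.
comment [
    slashy: copy []
    parse "agora-dev.org//forums//archive//subarch/" [
        some [thru "//" copy w to "/" (append slashy w)]
    ]
    probe slashy  ;; expected: ["forums" "archive" "subarch"]
]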
url-parse: func [
    "Returns a URL object"
    url [url!] "URL"
    /local protocol rest file
        cgi lag
        urlo nothing slashy w sd marker
][
    ;; return object
    urlo: make object! [
        up: top:
        cgi: anchor:
        subdomain: subdomains: domain: tld:
        dir-path: directories:
        protocol: file: url: none
    ]
    ;; full url as given
    urlo/url: :url
    ;; split the url into the end point and the path
    url: split-path url
    ;; get the protocol and the rest for further processing
    parse url/1 [copy protocol to "://" 3 skip copy rest to end]
    ;; protocol
    urlo/protocol: to-word protocol
    ;; up start
    either none? rest [
        urlo/up: to-url rejoin [urlo/protocol "://" dirize url/2]
    ][
        urlo/up: to-url url/1
    ]
    ;; top url
    urlo/top: to-url rejoin [urlo/protocol "://" dirize third parse urlo/up "/"]
    ;; if it was only an end-point url (i.e. parse copied nothing into rest),
    ;; set the nothing flag
    if nothing: none? rest [rest: url/2]
    ;; for the agora-style URLs, gather a block of the words that have "//" in front of them
    slashy: copy []
    parse rest [some [thru "//" copy w to "/" (append slashy w)]]
    ;; break up the rest, which is the directory tree
    rest: parse rest "/"
    ;; clean up rest if it has empty "" entries
    forall rest [if empty? rest/1 [remove rest]]
    ;; subdirectories, only if present (the nothing flag is set above)
    if not nothing [
        urlo/directories: copy []
        sd: copy next rest
        ;; if there are double-slash directories
        either not empty? slashy [
            forall sd [
                ;; if it is in the slashy mini db
                either found? find slashy sd/1 [
                    ;; insert sd/1 "/"
                    poke sd 1 to-refinement sd/1
                ][change sd to-word sd/1]
            ]
            urlo/dir-path: to-path sd
            forall sd [append urlo/directories sd/1]
        ][
            forall sd [change sd to-word sd/1]
            urlo/dir-path: to-path sd
            forall sd [append urlo/directories dirize to-file sd/1]
        ]
    ]
    ;; get all of the domain stuff
    rest: parse rest/1 "."
    ;; get the top-level domain
    ;; parse last rest/1 [copy tld to slash to end]
    urlo/tld: last rest
    ;; get the domain
    urlo/domain: pick rest -1 + length? rest
    ;; get the subdomains
    ;; junky: if 2 < length? rest [subdom: first at rest -2 + length? rest]
    if 2 < length? rest [
        urlo/subdomains: copy/part rest -2 + length? rest
        ;; get the actual subdomain
        urlo/subdomain: last urlo/subdomains
    ]
    ;; get the file and, if present, the cgi stuff or the anchor
    ;; proceed only if there is neither a slash nor a tld at the end
    ;; 3 < length? last parse url/2 "."
    if all [
        not-equal? slash last url/2
        not-equal? urlo/tld last parse url/2 "."
    ][
        parse url/2 [
            [
                [
                    copy file [to "?" | to "#"] marker: skip copy lag to end
                ]
                | copy file to end
            ]
        ]
        if all [lag not empty? lag] [
            any [
                ;; if decode-cgi thinks it's CGI
                if all [cgi: attempt [decode-cgi lag] not empty? cgi] [urlo/cgi: cgi]
                ;; if it is "?site"
                if all [
                    equal? #"?" first marker
                    equal? lag second parse marker to-string first marker
                ][
                    urlo/cgi: lag
                ]
            ]
            ;; if it isn't cgi, it must be an anchor
            if none? urlo/cgi [
                urlo/anchor: lag
            ]
        ]
        ;; file name set
        urlo/file: file
    ]
    ;; REBOL always returns something
    return urlo
]
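
;; Usage sketch (not part of the original script): a minimal example of
;; calling url-parse on URLs taken from the Tests block in the header and
;; reading a few fields of the returned object. The exact values depend on
;; the URL given. Wrapped in comment [] so it loads but is not executed.
comment [
    u: url-parse http://www.rebol.com/docs/core23/rebolcore-10.html#section-2
    print ["protocol: " u/protocol]
    print ["domain:   " u/domain]
    print ["tld:      " u/tld]
    print ["dir-path: " mold u/dir-path]
    print ["file:     " u/file]
    print ["anchor:   " u/anchor]
    ;; a double-slash (agora-style) test case
    probe url-parse http://agora-dev.org//forums/archive/index.php?site
]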