1 | |
|
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
|
18 | |
|
19 | |
|
20 | |
|
21 | |
|
22 | |
|
23 | |
|
24 | |
|
25 | |
|
26 | |
package com.mattunderscore.http.headers.useragent.parser; |
27 | |
|
28 | |
import static com.mattunderscore.http.headers.useragent.parser.ParsingUtils.contactInformation; |
29 | |
import static com.mattunderscore.http.headers.useragent.parser.ParsingUtils.nextElement; |
30 | |
|
31 | |
import com.mattunderscore.http.headers.useragent.details.application.Bot; |
32 | |
import com.mattunderscore.http.headers.useragent.details.application.ContactInformation; |
33 | |
import com.mattunderscore.http.headers.useragent.details.application.GoogleBot; |
34 | |
|
35 | |
|
36 | |
|
37 | |
|
38 | |
|
39 | |
|
40 | |
class BotParser implements TokenParser |
41 | |
{ |
42 | |
BotParser() |
43 | 2 | { |
44 | 2 | } |
45 | |
|
46 | |
|
47 | |
|
48 | |
|
49 | |
|
50 | |
@Override |
51 | |
public String parseToken(ParsingState state) |
52 | |
{ |
53 | 0 | String remainingHeader = state.getRemaining(); |
54 | 0 | if (remainingHeader.startsWith("YandexBot")) |
55 | |
{ |
56 | 0 | String version = nextElement(remainingHeader.substring(10)); |
57 | 0 | state.addDetail(new Bot("YandexBot",version)); |
58 | 0 | remainingHeader = remainingHeader.substring(10 + version.length()); |
59 | 0 | } |
60 | 0 | else if (remainingHeader.startsWith("bingbot")) |
61 | |
{ |
62 | 0 | String version = nextElement(remainingHeader.substring(10)); |
63 | 0 | state.addDetail(new Bot("bingbot",version)); |
64 | 0 | remainingHeader = remainingHeader.substring(10 + version.length()); |
65 | 0 | } |
66 | 0 | else if (remainingHeader.startsWith("Googlebot")) |
67 | |
{ |
68 | 0 | String version = nextElement(remainingHeader.substring(10)); |
69 | 0 | state.addDetail(new GoogleBot("Googlebot",version)); |
70 | 0 | remainingHeader = remainingHeader.substring(10 + version.length()); |
71 | 0 | } |
72 | 0 | else if (remainingHeader.startsWith("+")) |
73 | |
{ |
74 | 0 | String info = nextElement(remainingHeader.substring(1)); |
75 | 0 | state.addDetail(new ContactInformation(info)); |
76 | 0 | remainingHeader = remainingHeader.substring(1 + info.length()); |
77 | |
} |
78 | 0 | if (remainingHeader.startsWith("Googlebot")) |
79 | |
{ |
80 | 0 | String version = nextElement(remainingHeader.substring(10)); |
81 | 0 | state.addDetail(new GoogleBot("Googlebot",version)); |
82 | 0 | remainingHeader = remainingHeader.substring(10 + version.length()).trim(); |
83 | 0 | state.setRemaining(remainingHeader); |
84 | 0 | contactInformation(state); |
85 | 0 | } |
86 | 0 | else if (remainingHeader.startsWith("Googlebot-Image")) |
87 | |
{ |
88 | 0 | String version = nextElement(remainingHeader.substring(10)); |
89 | 0 | state.addDetail(new GoogleBot("Googlebot-Image",version)); |
90 | 0 | remainingHeader = remainingHeader.substring(10 + version.length()).trim(); |
91 | 0 | state.setRemaining(remainingHeader); |
92 | 0 | contactInformation(state); |
93 | 0 | } |
94 | 0 | else if (remainingHeader.startsWith("Google-Site-Verification")) |
95 | |
{ |
96 | 0 | String version = nextElement(remainingHeader.substring(25)); |
97 | 0 | state.addDetail(new GoogleBot("Google-Site-Verification",version)); |
98 | 0 | remainingHeader = remainingHeader.substring(25 + version.length()).trim(); |
99 | 0 | } |
100 | 0 | else if (remainingHeader.startsWith("HubSpot Connect")) |
101 | |
{ |
102 | 0 | String version = nextElement(remainingHeader.substring(16)); |
103 | 0 | state.addDetail(new Bot("HubSpot Connect",version)); |
104 | 0 | remainingHeader = remainingHeader.substring(16 + version.length()).trim(); |
105 | 0 | state.setRemaining(remainingHeader); |
106 | 0 | contactInformation(state); |
107 | 0 | } |
108 | 0 | else if (remainingHeader.startsWith("HubSpot Links Crawler")) |
109 | |
{ |
110 | 0 | String version = nextElement(remainingHeader.substring(22)); |
111 | 0 | state.addDetail(new Bot("HubSpot Links Crawler",version)); |
112 | 0 | remainingHeader = remainingHeader.substring(22 + version.length()); |
113 | 0 | String url = nextElement(remainingHeader); |
114 | 0 | state.addDetail(new ContactInformation(url)); |
115 | 0 | remainingHeader = remainingHeader.substring(url.length()); |
116 | 0 | } |
117 | 0 | else if (remainingHeader.startsWith("ip-web-crawler.com")) |
118 | |
{ |
119 | 0 | state.addDetail(new Bot("ip-web-crawler.com")); |
120 | 0 | remainingHeader = remainingHeader.substring(18).trim(); |
121 | |
} |
122 | 0 | else if (remainingHeader.startsWith("ARC Reader")) |
123 | |
{ |
124 | 0 | state.addDetail(new Bot("ARC Reader")); |
125 | 0 | remainingHeader = remainingHeader.substring(10).trim(); |
126 | 0 | state.setRemaining(remainingHeader); |
127 | 0 | contactInformation(state); |
128 | |
} |
129 | 0 | else if (remainingHeader.startsWith("linkdex.com")) |
130 | |
{ |
131 | 0 | String version = nextElement(remainingHeader.substring(12)); |
132 | 0 | state.addDetail(new Bot("linkdex.com",version)); |
133 | 0 | remainingHeader = remainingHeader.substring(12 + version.length()).trim(); |
134 | 0 | } |
135 | 0 | else if (remainingHeader.startsWith("btbot")) |
136 | |
{ |
137 | 0 | String version = nextElement(remainingHeader.substring(6)); |
138 | 0 | state.addDetail(new Bot("btbot",version)); |
139 | 0 | remainingHeader = remainingHeader.substring(6 + version.length()).trim(); |
140 | 0 | state.setRemaining(remainingHeader); |
141 | 0 | contactInformation(state); |
142 | 0 | } |
143 | 0 | else if (remainingHeader.startsWith("semanticdiscovery")) |
144 | |
{ |
145 | 0 | String version = nextElement(remainingHeader.substring(18)); |
146 | 0 | state.addDetail(new Bot("semanticdiscovery",version)); |
147 | 0 | remainingHeader = remainingHeader.substring(18 + version.length()).trim(); |
148 | 0 | state.setRemaining(remainingHeader); |
149 | 0 | contactInformation(state); |
150 | 0 | } |
151 | 0 | else if (remainingHeader.startsWith("msnbot")) |
152 | |
{ |
153 | 0 | String version = nextElement(remainingHeader.substring(7)); |
154 | 0 | state.addDetail(new Bot("msnbot",version)); |
155 | 0 | remainingHeader = remainingHeader.substring(7 + version.length()).trim(); |
156 | 0 | state.setRemaining(remainingHeader); |
157 | 0 | contactInformation(state); |
158 | 0 | } |
159 | 0 | else if (remainingHeader.startsWith("WhatWeb")) |
160 | |
{ |
161 | 0 | String version = nextElement(remainingHeader.substring(8)); |
162 | 0 | state.addDetail(new Bot("WhatWeb",version)); |
163 | 0 | remainingHeader = remainingHeader.substring(8 + version.length()).trim(); |
164 | 0 | state.setRemaining(remainingHeader); |
165 | 0 | contactInformation(state); |
166 | 0 | } |
167 | 0 | else if (remainingHeader.startsWith("core-project")) |
168 | |
{ |
169 | 0 | String version = nextElement(remainingHeader.substring(13)); |
170 | 0 | state.addDetail(new Bot("core-project",version)); |
171 | 0 | remainingHeader = remainingHeader.substring(13 + version.length()).trim(); |
172 | 0 | } |
173 | 0 | else if (remainingHeader.startsWith("PagesInventory")) |
174 | |
{ |
175 | 0 | state.addDetail(new Bot("PagesInventory")); |
176 | 0 | remainingHeader = remainingHeader.substring(14).trim(); |
177 | |
} |
178 | 0 | else if (remainingHeader.startsWith("AppEngine-Google")) |
179 | |
{ |
180 | 0 | state.addDetail(new Bot("AppEngine-Google")); |
181 | 0 | remainingHeader = remainingHeader.substring(16).trim(); |
182 | |
} |
183 | 0 | else if (remainingHeader.startsWith("Morfeus")) |
184 | |
{ |
185 | 0 | state.addDetail(new Bot(remainingHeader)); |
186 | |
} |
187 | 0 | else if (remainingHeader.startsWith("MacInroy Privacy Auditors")) |
188 | |
{ |
189 | 0 | state.addDetail(new Bot("MacInroy Privacy Auditors")); |
190 | |
} |
191 | 0 | else if (remainingHeader.startsWith("AdnormCrawler")) |
192 | |
{ |
193 | 0 | String url = nextElement(remainingHeader.substring(14)); |
194 | 0 | state.addDetail(new Bot("AdnormCrawler")); |
195 | 0 | state.addDetail(new ContactInformation(url)); |
196 | 0 | remainingHeader = remainingHeader.substring(14 + url.length()).trim(); |
197 | 0 | } |
198 | 0 | else if (remainingHeader.startsWith("ia_archiver")) |
199 | |
{ |
200 | 0 | state.addDetail(new Bot("ia_archiver")); |
201 | 0 | remainingHeader = remainingHeader.substring(11).trim(); |
202 | |
} |
203 | 0 | else if (remainingHeader.startsWith("ZmEu")) |
204 | |
{ |
205 | 0 | state.addDetail(new Bot("ZmEu")); |
206 | 0 | remainingHeader = remainingHeader.substring(4).trim(); |
207 | |
} |
208 | 0 | else if (remainingHeader.startsWith("nutch")) |
209 | |
{ |
210 | 0 | state.addDetail(new Bot("Nutch")); |
211 | 0 | remainingHeader = remainingHeader.substring(5).trim(); |
212 | |
} |
213 | 0 | else if (remainingHeader.startsWith("panscient.com")) |
214 | |
{ |
215 | 0 | state.addDetail(new Bot("panscient.com")); |
216 | 0 | remainingHeader= remainingHeader.substring(13); |
217 | |
} |
218 | 0 | else if (remainingHeader.startsWith("webcollage")) |
219 | |
{ |
220 | 0 | String version = nextElement(remainingHeader.substring(11)); |
221 | 0 | state.addDetail(new Bot("webcollage",version)); |
222 | 0 | remainingHeader= remainingHeader.substring(11 + version.length()); |
223 | 0 | } |
224 | 0 | else if (remainingHeader.startsWith("W3C_Validator")) |
225 | |
{ |
226 | 0 | String version = nextElement(remainingHeader.substring(14)); |
227 | 0 | state.addDetail(new Bot("W3C_Validator",version)); |
228 | 0 | remainingHeader= remainingHeader.substring(14 + version.length()); |
229 | 0 | if (remainingHeader.startsWith(" http://")) |
230 | |
{ |
231 | 0 | String url = nextElement(remainingHeader.substring(1)); |
232 | 0 | state.addDetail(new ContactInformation(url)); |
233 | |
} |
234 | 0 | } |
235 | 0 | else if (remainingHeader.startsWith("Xenu Link Sleuth")) |
236 | |
{ |
237 | 0 | String version = nextElement(remainingHeader.substring(17)); |
238 | 0 | state.addDetail(new Bot("Xenu Link Sleuth",version)); |
239 | 0 | remainingHeader= remainingHeader.substring(17 + version.length()); |
240 | 0 | } |
241 | |
else |
242 | |
{ |
243 | 0 | String nextToken = nextElement(remainingHeader.substring(0)); |
244 | 0 | if (nextToken.contains("bot")) |
245 | |
{ |
246 | 0 | state.addDetail(new Bot(nextToken)); |
247 | 0 | remainingHeader = remainingHeader.substring(nextToken.length()); |
248 | 0 | state.setRemaining(remainingHeader); |
249 | 0 | contactInformation(state); |
250 | |
} |
251 | 0 | else if (nextToken.contains("Bot")) |
252 | |
{ |
253 | 0 | state.addDetail(new Bot(nextToken)); |
254 | 0 | remainingHeader = remainingHeader.substring(nextToken.length()); |
255 | 0 | state.setRemaining(remainingHeader); |
256 | 0 | contactInformation(state); |
257 | |
} |
258 | 0 | else if (nextToken.contains("BOT")) |
259 | |
{ |
260 | 0 | state.addDetail(new Bot(nextToken)); |
261 | 0 | remainingHeader = remainingHeader.substring(nextToken.length()); |
262 | 0 | state.setRemaining(remainingHeader); |
263 | 0 | contactInformation(state); |
264 | |
} |
265 | |
} |
266 | 0 | state.setRemaining(remainingHeader); |
267 | 0 | return null; |
268 | |
} |
269 | |
} |