1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.sw4j.tool.html.crawler;
20
21 import java.io.IOException;
22 import java.io.Reader;
23 import java.io.StringWriter;
24 import java.util.LinkedList;
25 import java.util.List;
26 import org.jsoup.Jsoup;
27 import org.jsoup.nodes.Attributes;
28 import org.jsoup.nodes.Document;
29 import org.jsoup.nodes.Element;
30
31 public class SinglePageCrawler {
32
// One listener list per registration group. Each list is filled through the
// Builder method of the same name (aListener through Builder#addAListener, and so on).
// Within this class only linkListener is currently notified (see handleLink).

/** Listeners registered through {@link Builder#addAListener}. */
private final List<ExternalReferenceListener> aListener;

/** Listeners registered through {@link Builder#addAreaListener}. */
private final List<ExternalReferenceListener> areaListener;

/** Listeners registered through {@link Builder#addAudioListener}. */
private final List<ExternalReferenceListener> audioListener;

/** Listeners registered through {@link Builder#addBaseListener}. */
private final List<ExternalReferenceListener> baseListener;

/** Listeners registered through {@link Builder#addBlockquoteListener}. */
private final List<ExternalReferenceListener> blockquoteListener;

/** Listeners registered through {@link Builder#addButtonListener}. */
private final List<ExternalReferenceListener> buttonListener;

/** Listeners registered through {@link Builder#addDelListener}. */
private final List<ExternalReferenceListener> delListener;

/** Listeners registered through {@link Builder#addEmbedListener}. */
private final List<ExternalReferenceListener> embedListener;

/** Listeners registered through {@link Builder#addFormListener}. */
private final List<ExternalReferenceListener> formListener;

/** Listeners registered through {@link Builder#addIframeListener}. */
private final List<ExternalReferenceListener> iframeListener;

/** Listeners registered through {@link Builder#addImgListener}. */
private final List<ExternalReferenceListener> imgListener;

/** Listeners registered through {@link Builder#addInputListener}. */
private final List<ExternalReferenceListener> inputListener;

/** Listeners registered through {@link Builder#addInsListener}. */
private final List<ExternalReferenceListener> insListener;

/** Listeners registered through {@link Builder#addItemListener}. */
private final List<ExternalReferenceListener> itemListener;

/** Listeners registered through {@link Builder#addLinkListener}; notified for style sheet links. */
private final List<ExternalReferenceListener> linkListener;

/** Listeners registered through {@link Builder#addManifestListener}. */
private final List<ExternalReferenceListener> manifestListener;

/** Listeners registered through {@link Builder#addObjectListener}. */
private final List<ExternalReferenceListener> objectListener;

/** Listeners registered through {@link Builder#addQListener}. */
private final List<ExternalReferenceListener> qListener;

/** Listeners registered through {@link Builder#addScriptListener}. */
private final List<ExternalReferenceListener> scriptListener;

/** Listeners registered through {@link Builder#addSourceListener}. */
private final List<ExternalReferenceListener> sourceListener;

/** Listeners registered through {@link Builder#addTrackListener}. */
private final List<ExternalReferenceListener> trackListener;

/** Listeners registered through {@link Builder#addVideoListener}. */
private final List<ExternalReferenceListener> videoListener;

/**
 * Creates a crawler from the listener lists collected by a builder.
 *
 * <p>The parameters are declared in alphabetical group order, which is the order in which
 * {@code BuilderImpl.build()} passes its lists ({@code delListener} before
 * {@code embedListener}); this keeps every list in the field of the same name.
 *
 * <p>Every list is copied, so adding further listeners to the builder after the crawler has
 * been built does not change the listeners of this instance.
 *
 * @param aListener the listeners of the {@code a} group
 * @param areaListener the listeners of the {@code area} group
 * @param audioListener the listeners of the {@code audio} group
 * @param baseListener the listeners of the {@code base} group
 * @param blockquoteListener the listeners of the {@code blockquote} group
 * @param buttonListener the listeners of the {@code button} group
 * @param delListener the listeners of the {@code del} group
 * @param embedListener the listeners of the {@code embed} group
 * @param formListener the listeners of the {@code form} group
 * @param iframeListener the listeners of the {@code iframe} group
 * @param imgListener the listeners of the {@code img} group
 * @param inputListener the listeners of the {@code input} group
 * @param insListener the listeners of the {@code ins} group
 * @param itemListener the listeners of the {@code item} group
 * @param linkListener the listeners of the {@code link} group
 * @param manifestListener the listeners of the {@code manifest} group
 * @param objectListener the listeners of the {@code object} group
 * @param qListener the listeners of the {@code q} group
 * @param scriptListener the listeners of the {@code script} group
 * @param sourceListener the listeners of the {@code source} group
 * @param trackListener the listeners of the {@code track} group
 * @param videoListener the listeners of the {@code video} group
 */
private SinglePageCrawler(final List<ExternalReferenceListener> aListener,
        final List<ExternalReferenceListener> areaListener,
        final List<ExternalReferenceListener> audioListener,
        final List<ExternalReferenceListener> baseListener,
        final List<ExternalReferenceListener> blockquoteListener,
        final List<ExternalReferenceListener> buttonListener,
        final List<ExternalReferenceListener> delListener,
        final List<ExternalReferenceListener> embedListener,
        final List<ExternalReferenceListener> formListener,
        final List<ExternalReferenceListener> iframeListener,
        final List<ExternalReferenceListener> imgListener,
        final List<ExternalReferenceListener> inputListener,
        final List<ExternalReferenceListener> insListener,
        final List<ExternalReferenceListener> itemListener,
        final List<ExternalReferenceListener> linkListener,
        final List<ExternalReferenceListener> manifestListener,
        final List<ExternalReferenceListener> objectListener,
        final List<ExternalReferenceListener> qListener,
        final List<ExternalReferenceListener> scriptListener,
        final List<ExternalReferenceListener> sourceListener,
        final List<ExternalReferenceListener> trackListener,
        final List<ExternalReferenceListener> videoListener) {
    // Snapshot copies: the builder keeps its own mutable lists and may be reused.
    this.aListener = new LinkedList<>(aListener);
    this.areaListener = new LinkedList<>(areaListener);
    this.audioListener = new LinkedList<>(audioListener);
    this.baseListener = new LinkedList<>(baseListener);
    this.blockquoteListener = new LinkedList<>(blockquoteListener);
    this.buttonListener = new LinkedList<>(buttonListener);
    this.delListener = new LinkedList<>(delListener);
    this.embedListener = new LinkedList<>(embedListener);
    this.formListener = new LinkedList<>(formListener);
    this.iframeListener = new LinkedList<>(iframeListener);
    this.imgListener = new LinkedList<>(imgListener);
    this.inputListener = new LinkedList<>(inputListener);
    this.insListener = new LinkedList<>(insListener);
    this.itemListener = new LinkedList<>(itemListener);
    this.linkListener = new LinkedList<>(linkListener);
    this.manifestListener = new LinkedList<>(manifestListener);
    this.objectListener = new LinkedList<>(objectListener);
    this.qListener = new LinkedList<>(qListener);
    this.scriptListener = new LinkedList<>(scriptListener);
    this.sourceListener = new LinkedList<>(sourceListener);
    this.trackListener = new LinkedList<>(trackListener);
    this.videoListener = new LinkedList<>(videoListener);
}
122
123 public static Builder builder() {
124 return new BuilderImpl();
125 }
126
127 public void parse(String content, String baseUri) {
128 Document document = Jsoup.parse(content);
129 for (Element child: document.head().children()) {
130 if ("link".equals(child.nodeName())) {
131 handleLink(child);
132 }
133 }
134 for (Element child: document.body().children()) {
135 }
136 }
137
138 public void parse(Reader content, String baseUri) throws IOException {
139 StringWriter stringWriter = new StringWriter();
140 content.transferTo(stringWriter);
141 parse(stringWriter.toString(), baseUri);
142 }
143
144 private void handleLink(Element link) {
145 Attributes attributes = link.attributes();
146 String[] rels = attributes.getIgnoreCase("rel").split("\\s");
147 boolean alternate = false;
148 for (String rel: rels) {
149 switch (rel) {
150 case "alternate":
151 alternate = true;
152 break;
153 case "stylesheet":
154 String href = attributes.get("href");
155 RelationType type = RelationType.STYLESHEET;
156 if (alternate) {
157 type = RelationType.ALTERNATE_STYLESHEET;
158 alternate = false;
159 }
160 ExternalReferenceEventenceEvent.html#ExternalReferenceEvent">ExternalReferenceEvent event = new ExternalReferenceEvent(link.tagName(), LinkType.EXTERNAL_RESOURCE,
161 href, type);
162 for (ExternalReferenceListener listener : this.linkListener) {
163 listener.link(event);
164 }
165 break;
166 default:
167 break;
168 }
169 }
170 }
171
172
173 public interface Builder {
174
175
176
177
178
179
180
181 Builder addAListener(ExternalReferenceListener listener);
182
183
184
185
186
187
188
189 Builder addAreaListener(ExternalReferenceListener listener);
190
191
192
193
194
195
196
197 Builder addAudioListener(ExternalReferenceListener listener);
198
199
200
201
202
203
204
205 Builder addBaseListener(ExternalReferenceListener listener);
206
207
208
209
210
211
212
213 Builder addBlockquoteListener(ExternalReferenceListener listener);
214
215
216
217
218
219
220
221 Builder addButtonListener(ExternalReferenceListener listener);
222
223
224
225
226
227
228
229 Builder addDelListener(ExternalReferenceListener listener);
230
231
232
233
234
235
236
237 Builder addEmbedListener(ExternalReferenceListener listener);
238
239
240
241
242
243
244
245 Builder addFormListener(ExternalReferenceListener listener);
246
247
248
249
250
251
252
253 Builder addIframeListener(ExternalReferenceListener listener);
254
255
256
257
258
259
260
261 Builder addImgListener(ExternalReferenceListener listener);
262
263
264
265
266
267
268
269
270 Builder addInputListener(ExternalReferenceListener listener);
271
272
273
274
275
276
277
278 Builder addInsListener(ExternalReferenceListener listener);
279
280
281
282
283
284
285
286
287 Builder addItemListener(ExternalReferenceListener listener);
288
289
290
291
292
293
294
295 Builder addLinkListener(ExternalReferenceListener listener);
296
297
298
299
300
301
302
303 Builder addManifestListener(ExternalReferenceListener listener);
304
305
306
307
308
309
310
311 Builder addObjectListener(ExternalReferenceListener listener);
312
313
314
315
316
317
318
319 Builder addQListener(ExternalReferenceListener listener);
320
321
322
323
324
325
326
327 Builder addScriptListener(ExternalReferenceListener listener);
328
329
330
331
332
333
334
335 Builder addSourceListener(ExternalReferenceListener listener);
336
337
338
339
340
341
342
343 Builder addTrackListener(ExternalReferenceListener listener);
344
345
346
347
348
349
350
351 Builder addVideoListener(ExternalReferenceListener listener);
352
353 SinglePageCrawler build();
354
355 }
356
357
358 private static class BuilderImpl implements Builder {
359
360 private final List<ExternalReferenceListener> aListener = new LinkedList<>();
361
362 private final List<ExternalReferenceListener> areaListener = new LinkedList<>();
363
364 private final List<ExternalReferenceListener> audioListener = new LinkedList<>();
365
366 private final List<ExternalReferenceListener> baseListener = new LinkedList<>();
367
368 private final List<ExternalReferenceListener> blockquoteListener = new LinkedList<>();
369
370 private final List<ExternalReferenceListener> buttonListener = new LinkedList<>();
371
372 private final List<ExternalReferenceListener> delListener = new LinkedList<>();
373
374 private final List<ExternalReferenceListener> embedListener = new LinkedList<>();
375
376 private final List<ExternalReferenceListener> formListener = new LinkedList<>();
377
378 private final List<ExternalReferenceListener> iframeListener = new LinkedList<>();
379
380 private final List<ExternalReferenceListener> imgListener = new LinkedList<>();
381
382 private final List<ExternalReferenceListener> inputListener = new LinkedList<>();
383
384 private final List<ExternalReferenceListener> insListener = new LinkedList<>();
385
386 private final List<ExternalReferenceListener> itemListener = new LinkedList<>();
387
388 private final List<ExternalReferenceListener> linkListener = new LinkedList<>();
389
390 private final List<ExternalReferenceListener> manifestListener = new LinkedList<>();
391
392 private final List<ExternalReferenceListener> objectListener = new LinkedList<>();
393
394 private final List<ExternalReferenceListener> qListener = new LinkedList<>();
395
396 private final List<ExternalReferenceListener> scriptListener = new LinkedList<>();
397
398 private final List<ExternalReferenceListener> sourceListener = new LinkedList<>();
399
400 private final List<ExternalReferenceListener> trackListener = new LinkedList<>();
401
402 private final List<ExternalReferenceListener> videoListener = new LinkedList<>();
403
404 private BuilderImpl() {}
405
406 @Override
407 public Builder addAListener(final ExternalReferenceListener listener) {
408 this.aListener.add(listener);
409 return this;
410 }
411
412 @Override
413 public Builder addAreaListener(final ExternalReferenceListener listener) {
414 this.areaListener.add(listener);
415 return this;
416 }
417
418 @Override
419 public Builder addAudioListener(final ExternalReferenceListener listener) {
420 this.audioListener.add(listener);
421 return this;
422 }
423
424 @Override
425 public Builder addBaseListener(final ExternalReferenceListener listener) {
426 this.baseListener.add(listener);
427 return this;
428 }
429
430 @Override
431 public Builder addBlockquoteListener(final ExternalReferenceListener listener) {
432 this.blockquoteListener.add(listener);
433 return this;
434 }
435
436 @Override
437 public Builder addButtonListener(final ExternalReferenceListener listener) {
438 this.buttonListener.add(listener);
439 return this;
440 }
441
442 @Override
443 public Builder addDelListener(final ExternalReferenceListener listener) {
444 this.delListener.add(listener);
445 return this;
446 }
447
448 @Override
449 public Builder addEmbedListener(final ExternalReferenceListener listener) {
450 this.embedListener.add(listener);
451 return this;
452 }
453
454 @Override
455 public Builder addIframeListener(final ExternalReferenceListener listener) {
456 this.iframeListener.add(listener);
457 return this;
458 }
459
460 @Override
461 public Builder addImgListener(final ExternalReferenceListener listener) {
462 this.imgListener.add(listener);
463 return this;
464 }
465
466 @Override
467 public Builder addInputListener(final ExternalReferenceListener listener) {
468 this.inputListener.add(listener);
469 return this;
470 }
471
472 @Override
473 public Builder addInsListener(final ExternalReferenceListener listener) {
474 this.insListener.add(listener);
475 return this;
476 }
477
478 @Override
479 public Builder addItemListener(final ExternalReferenceListener listener) {
480 this.itemListener.add(listener);
481 return this;
482 }
483
484 @Override
485 public Builder addLinkListener(final ExternalReferenceListener listener) {
486 this.linkListener.add(listener);
487 return this;
488 }
489
490 @Override
491 public Builder addManifestListener(final ExternalReferenceListener listener) {
492 this.manifestListener.add(listener);
493 return this;
494 }
495
496 @Override
497 public Builder addObjectListener(final ExternalReferenceListener listener) {
498 this.objectListener.add(listener);
499 return this;
500 }
501
502 @Override
503 public Builder addQListener(final ExternalReferenceListener listener) {
504 this.qListener.add(listener);
505 return this;
506 }
507
508 @Override
509 public Builder addScriptListener(final ExternalReferenceListener listener) {
510 this.scriptListener.add(listener);
511 return this;
512 }
513
514 @Override
515 public Builder addSourceListener(final ExternalReferenceListener listener) {
516 this.sourceListener.add(listener);
517 return this;
518 }
519
520 @Override
521 public Builder addTrackListener(final ExternalReferenceListener listener) {
522 this.trackListener.add(listener);
523 return this;
524 }
525
526 @Override
527 public Builder addVideoListener(final ExternalReferenceListener listener) {
528 this.videoListener.add(listener);
529 return this;
530 }
531
532 @Override
533 public Builder addFormListener(final ExternalReferenceListener listener) {
534 this.formListener.add(listener);
535 return this;
536 }
537
538 public SinglePageCrawler build() {
539 return new SinglePageCrawler(this.aListener, this.areaListener, this.audioListener, this.baseListener,
540 this.blockquoteListener, this.buttonListener, this.delListener, this.embedListener,
541 this.formListener, this.iframeListener, this.imgListener, this.inputListener, this.insListener,
542 this.itemListener, this.linkListener, this.manifestListener, this.objectListener, this.qListener,
543 this.scriptListener, this.sourceListener, this.trackListener, this.videoListener);
544 }
545
546 }
547
548 }