How to Build a Simple Web Crawler in PHP

A web crawler is a computer program that collects key values (HREF links, image links, metadata, etc.) from a given website URL. It is designed to follow the HREF links it has already fetched from a previous URL, so the crawler can jump from one website to other websites. It is usually called a web spider or web bot. This mechanism acts as the backbone of a web search engine.

[Download Source code]

config.php
<?php
// Shared configuration for the crawler. Every variable below is defined in the
// global scope so that other scripts can read it with the `global` keyword.

//---- Href Element Syntax Start ----
// Tokens that must be seen, in this order, to recognise an anchor's link:
// the tag opener, the attribute name, the assignment sign and the tag closer.
$hrefTag = array();
$hrefTag[0] = "<a";
$hrefTag[1] = "href";
$hrefTag[2] = "=";
$hrefTag[3] = ">";
// Scanner bookkeeping defaults for the anchor pattern.
$hrefTagCountStart = 0;                      // index of the token being matched
$hrefTagCountFinal = count($hrefTag);        // number of tokens in the pattern
$hrefTagLengthStart = 0;                     // characters of that token matched so far
$hrefTagLengthFinal = strlen($hrefTag[0]);   // length of the token being matched
$hrefTagPointer =& $hrefTag[0];              // reference to the token being matched
//---- Href Element Syntax End ----

//---- Image Element Syntax Start ----
// Same layout as above, for the source of an image element.
$imgTag = array();
$imgTag[0] = "<img";
$imgTag[1] = "src";
$imgTag[2] = "=";
$imgTag[3] = ">";
// Scanner bookkeeping defaults for the image pattern.
$imgTagCountStart = 0;                       // index of the token being matched
$imgTagCountFinal = count($imgTag);          // number of tokens in the pattern
$imgTagLengthStart = 0;                      // characters of that token matched so far
$imgTagLengthFinal = strlen($imgTag[0]);     // length of the token being matched
$imgTagPointer =& $imgTag[0];                // reference to the token being matched
//---- Image Element Syntax End ----

//---- Valid Domain Start -----
// "Extensions" that mark a link as a crawlable page. Top-level domains are
// listed next to page suffixes because the extension of a bare host URL such
// as http://example.com is its TLD ("com").
$Url_Extensions = array("asp","aspx","html","htm","php","php3","biz","com","edu","gov","info","int","jobs",
"net","org","in","us","uk");
//---- Valid Domain End -------

//---- Valid File Extension Start ----
$Document_Extensions = array("doc","pdf","ppt","txt");
$Image_Extensions = array("gif","jpeg","jpg","png");
//---- Valid File Extension End ------

//---- Curl Parameters Start ----
$crawlOptions = array(
    CURLOPT_RETURNTRANSFER => true, // return web page
    CURLOPT_HEADER => false, // don't return headers
    CURLOPT_FOLLOWLOCATION => true, // follow redirects
    CURLOPT_ENCODING => "", // handle all encodings
    CURLOPT_USERAGENT => "AlgoberryBot", // who am i
    CURLOPT_AUTOREFERER => true, // set referer on redirect
    CURLOPT_CONNECTTIMEOUT => 120, // timeout on connect
    CURLOPT_TIMEOUT => 120, // timeout on response
    // Was 0, which refuses every redirect even though FOLLOWLOCATION is on,
    // so any redirecting URL ended in a cURL error.
    CURLOPT_MAXREDIRS => 10, // stop after 10 redirects
);
//---- Curl Parameters End ------
?>

crawler.php
<?php
/**
 * Single-page link extractor.
 *
 * parser() downloads the page at $siteURL with cURL, scans the raw markup for
 * the values of <a href="..."> and <img src="..."> and groups the resulting
 * absolute URLs by type ("domain", "image", "document", "unknown").
 *
 * The class reads its configuration from the globals defined in config.php:
 * $hrefTag, $imgTag, $Url_Extensions, $Document_Extensions,
 * $Image_Extensions and $crawlOptions.
 */
class webCrawler
{
    /** URL of the page to crawl; set by the caller before calling parser(). */
    public $siteURL;

    /** Message describing why the last parser() call failed, "" on success. */
    public $error;

    /**
     * Links collected so far, keyed by type; each value is a list of unique
     * URLs. An "error" key holding the failure message is added when the
     * last parser() call failed. Links accumulate across calls.
     */
    public $linkBuffer;

    public function __construct()
    {
        $this->siteURL = "";
        $this->error = "";
        $this->linkBuffer = array();
    }

    /**
     * Crawl $siteURL and return the collected links.
     *
     * @return array the link buffer: lists of URLs under the keys "domain",
     *               "image", "document" and "unknown" (a key is present only
     *               when at least one link of that type was found) plus
     *               "error" => message when the crawl failed.
     */
    public function parser()
    {
        global $hrefTag, $imgTag, $crawlOptions;
        global $Url_Extensions, $Document_Extensions, $Image_Extensions;

        if (!is_array($this->linkBuffer))
        { $this->linkBuffer = array(); }

        // Forget the failure of a previous call so it is not reported again.
        $this->error = "";
        unset($this->linkBuffer["error"]);

        $url = trim((string) $this->siteURL);
        if ($url === "")
        {
            $this->error = "Please enter url";
        }
        else
        {
            $crawlURL = rtrim($url, "/");
            $content = $this->fetchPage($crawlURL, $crawlOptions);
            if ($content === false)
            {
                $this->error = "Unable to proceed, permission denied";
            }
            else
            {
                // Checked in this order; the first list containing the
                // extension decides the type.
                $extensions = array(
                    "domain" => $Url_Extensions,
                    "image" => $Image_Extensions,
                    "document" => $Document_Extensions,
                );
                $this->scanPage(
                    $content,
                    $this->describeBase($crawlURL),
                    array($hrefTag, $imgTag),
                    $extensions
                );
            }
        }

        if ($this->error != "")
        { $this->linkBuffer["error"] = $this->error; }

        return $this->linkBuffer;
    }

    /**
     * Download a page with cURL.
     *
     * @param string $crawlURL address to fetch
     * @param array  $options  cURL options, as accepted by curl_setopt_array()
     * @return string|false the response body, or false on any cURL failure
     */
    private function fetchPage($crawlURL, $options)
    {
        $handle = curl_init($crawlURL);
        if ($handle === false)
        { return false; }

        curl_setopt_array($handle, $options);
        $body = curl_exec($handle);
        $errorNumber = curl_errno($handle);
        curl_close($handle);

        // curl_exec() only yields a string when CURLOPT_RETURNTRANSFER is set
        // and the transfer succeeded; anything else cannot be scanned.
        if ($errorNumber !== 0 || !is_string($body))
        { return false; }

        return $body;
    }

    /**
     * Split the crawled URL into the pieces needed to resolve relative links.
     *
     * @param string $crawlURL crawled address without trailing slash
     * @return array "parts"     => the URL split on "/",
     *               "scheme"    => scheme with colon ("https:"), "http:" when
     *                              the URL has none,
     *               "root"      => scheme and host ("https://example.com"),
     *               "directory" => URL that plain relative links are appended to
     */
    private function describeBase($crawlURL)
    {
        $parts = explode("/", $crawlURL);
        $partCount = count($parts);
        // "scheme://host/..." splits into "scheme:", "" and "host".
        $hasScheme = $partCount >= 3 && $parts[1] === "" && substr($parts[0], -1) === ":";

        // A URL without a path ("https://example.com") is its own directory;
        // dirname() would reduce it to the bare scheme.
        $directory = ($hasScheme && $partCount === 3) ? $crawlURL : dirname($crawlURL);

        return array(
            "parts" => $parts,
            "scheme" => $hasScheme ? $parts[0] : "http:",
            "root" => implode("/", array_slice($parts, 0, 3)),
            "directory" => $directory,
        );
    }

    /**
     * Walk the page once, feeding every character to one matcher per pattern.
     *
     * A pattern is a list of tokens (e.g. "<a", "href", "=", ">") that must
     * be seen in order. When the second-to-last token completes, the quoted
     * attribute value that follows is read; when the last token completes,
     * the value read for that pattern is recorded.
     *
     * @param string $content    page markup
     * @param array  $base       result of describeBase()
     * @param array  $patterns   list of token lists
     * @param array  $extensions type => list of extensions
     */
    private function scanPage($content, array $base, array $patterns, array $extensions)
    {
        // Each matcher keeps its own progress and its own attribute value so
        // the patterns cannot overwrite each other's state.
        $matchers = array();
        foreach ($patterns as $tokens)
        {
            $matchers[] = array(
                "tokens" => $tokens,
                "stage" => 0,   // index of the token being matched
                "offset" => 0,  // characters of that token matched so far
                "found" => array("url" => "", "parent" => 0, "single" => 0, "double" => 0),
            );
        }
        $matcherCount = count($matchers);

        $length = strlen($content);
        $position = 0;
        while ($position < $length)
        {
            // Tokens are lower case, so tag and attribute names match in any case.
            $character = strtolower($content[$position]);

            for ($index = 0; $index < $matcherCount; $index++)
            {
                $tokens = $matchers[$index]["tokens"];
                $tokenCount = count($tokens);
                $token = $tokens[$matchers[$index]["stage"]];

                if ($token[$matchers[$index]["offset"]] !== $character)
                {
                    $matchers[$index]["offset"] = 0;
                    continue;
                }

                $matchers[$index]["offset"]++;
                if ($matchers[$index]["offset"] < strlen($token))
                { continue; }

                // The whole token has been seen: move on to the next one.
                $matchers[$index]["offset"] = 0;
                $matchers[$index]["stage"]++;

                if ($matchers[$index]["stage"] === $tokenCount)
                {
                    // Closing token reached: keep the value read for this tag.
                    $this->recordLink($matchers[$index]["found"], $base, $extensions);
                    $matchers[$index]["stage"] = 0;
                }
                else if ($matchers[$index]["stage"] === $tokenCount - 1)
                {
                    // Assignment sign reached: consume the quoted value. This
                    // moves $position and $character forward for the
                    // remaining matchers as well.
                    $matchers[$index]["found"] = $this->readAttributeValue(
                        $content,
                        $length,
                        $position,
                        $character
                    );
                }
            }

            $position++;
        }
    }

    /**
     * Read the quoted attribute value that follows $position.
     *
     * Leading "." and "/" characters are not stored in the value; they are
     * counted instead so the caller can tell "../x", "/x" and "//host/x"
     * apart. Reading stops at the closing quote or at "#" (the fragment is
     * dropped), leaving $position on the character before the terminator.
     *
     * @param string $content   page markup
     * @param int    $length    length of $content
     * @param int    $position  in/out: current scan position
     * @param string $character in/out: last character looked at
     * @return array "url" => value without its leading dots and slashes,
     *               "parent" => number of leading "../",
     *               "single" => 1 or more when the value starts with "/",
     *               "double" => 1 or more when the value starts with "//"
     */
    private function readAttributeValue($content, $length, &$position, &$character)
    {
        $found = array("url" => "", "parent" => 0, "single" => 0, "double" => 0);
        $dotCount = 0;
        $slashCount = 0;

        $position++;
        while ($position < $length)
        {
            $character = $content[$position];
            if ($character === "\"" || $character === "'")
            {
                // Opening quote found: collect up to the terminator.
                $position++;
                while ($position < $length)
                {
                    $character = $content[$position];
                    if ($character === "\"" || $character === "'" || $character === "#")
                    {
                        $position--;
                        break;
                    }

                    if ($found["url"] !== "" || ($character !== "." && $character !== "/"))
                    {
                        $found["url"] .= $character;
                    }
                    else if ($character === ".")
                    {
                        $dotCount++;
                        $slashCount = 0;
                    }
                    else
                    {
                        $slashCount++;
                        if ($dotCount == 2 && $slashCount == 1)
                        { $found["parent"]++; }
                        else if ($dotCount == 0 && $slashCount == 1)
                        { $found["single"]++; }
                        else if ($dotCount == 0 && $slashCount == 2)
                        { $found["double"]++; }
                        $dotCount = 0;
                    }
                    $position++;
                }
                break;
            }
            $position++;
        }

        return $found;
    }

    /**
     * Turn an attribute value into an absolute URL, classify it by extension
     * and add it to the link buffer unless it is already there.
     *
     * Values that do not resolve to a valid URL with a host are ignored.
     *
     * @param array $found      result of readAttributeValue()
     * @param array $base       result of describeBase()
     * @param array $extensions type => list of extensions
     */
    private function recordLink(array $found, array $base, array $extensions)
    {
        $link = $found["url"];
        if ($link === "")
        { return; }

        if ($found["double"] >= 1)
        {
            // Protocol-relative link: reuse the scheme of the crawled page.
            $link = $base["scheme"]."//".$link;
        }
        else if ($found["parent"] >= 1)
        {
            // Drop one trailing segment of the crawled URL per "../", but
            // never climb above the host.
            $partCount = count($base["parts"]);
            $keep = max($partCount - $found["parent"], min(3, $partCount));
            $link = implode("/", array_slice($base["parts"], 0, $keep))."/".$link;
        }
        else if ($found["single"] >= 1)
        {
            $link = $base["root"]."/".$link;
        }

        $link = rtrim(urldecode($link), "/");

        $valid = filter_var($link, FILTER_VALIDATE_URL) !== false;
        if (!$valid)
        {
            // Not absolute yet: treat it as relative to the page's directory.
            $link = $base["directory"]."/".$link;
            $valid = filter_var($link, FILTER_VALIDATE_URL) !== false;
        }
        if (!$valid)
        { return; }

        $urlParts = parse_url($link);
        if (!is_array($urlParts) || !isset($urlParts["host"]))
        { return; }
        if (trim(strtolower($urlParts["host"])) === "")
        { return; }

        $extension = pathinfo($link, PATHINFO_EXTENSION);
        if ($extension == "")
        {
            $type = "domain";
        }
        else
        {
            // pathinfo() keeps a query string attached ("php?id=1"): cut it off.
            $queryStart = strpos($extension, "?");
            if ($queryStart !== false)
            { $extension = trim(substr($extension, 0, $queryStart)); }

            $type = "unknown";
            foreach ($extensions as $candidate => $list)
            {
                if (in_array($extension, $list, true))
                {
                    $type = $candidate;
                    break;
                }
            }
        }

        if (!isset($this->linkBuffer[$type]) || !is_array($this->linkBuffer[$type]))
        { $this->linkBuffer[$type] = array(); }
        if (!in_array($link, $this->linkBuffer[$type], true))
        { $this->linkBuffer[$type][] = $link; }
    }
}
?>

index.php
<?php
// Front end of the crawler: shows a URL form and, after submission, dumps the
// links that webCrawler::parser() collected from that URL.
include_once("config.php");
include_once("crawler.php");

// Submitted URL, or "" when the field is missing or is not a plain string
// (a request can send url[]=... to turn it into an array).
$submittedURL = (isset($_POST["url"]) && is_string($_POST["url"])) ? $_POST["url"] : "";
?>
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<title>Web Crawler</title>
<style>
.textbox
{
border: medium none;
font-family: Arial,Sans-Serif;
font-size: 16px;
height:28px;
line-height: 24px;
width: 578px;
border: 1px solid #ACBABD;
padding:5px;
}
.submitbox
{
height:40px;
width:80px;
text-align:center;
}
</style>
</head>
<body>
<div style="padding:10px;">
<div>
<form name="crawlsearch" method="post" action="index.php">
<table>
<tr>
<td align="center" colspan="2"><b>Web Crawl</b></td>
</tr>
<tr>
<td>
<?php // Escaped so a value containing quotes or markup cannot break out of the attribute. ?>
<input class="textbox" type="text" placeholder="Enter URL" name="url" value="<?php echo htmlspecialchars($submittedURL, ENT_QUOTES, "UTF-8"); ?>">
</td>
<td>
<input class="submitbox" type="submit" name= "SubmitBox" value="Crawl">
</td>
</tr>
</table>
</form>
</div>
</div>
<div style="padding:10px;">
<?php
if(isset($_POST["SubmitBox"]))
{
    // NOTE(review): the URL is fetched by the server exactly as typed; if this
    // page is reachable by untrusted users, restrict the allowed schemes and
    // hosts before crawling.
    $obj = new webCrawler();
    $obj->siteURL = $submittedURL;
    $returnData = $obj->parser();
    echo "<pre>";
    // The result contains strings taken from the crawled page, so it is
    // escaped before being written into this document.
    echo htmlspecialchars(print_r($returnData, true), ENT_QUOTES, "UTF-8");
    echo "</pre>";
}
?>
</div>
</body>
</html>

0 Comments

Leave a Comment:

(Your email address will not be published. Required fields are marked *)

Name *

Email *

Message *

Social Media