Ajax link checker

Teensweb

New Member
Messages
352
Reaction score
1
Points
0
I found this cool jquery pluggin while searching the pluggins casually at the jquery site. It might be very usefull for web-admins.Here's the code:
PHP:
<?php
/**
 * jQuery Link Checker 
 * 
 * http://troy.dyle.net
 * Created for: http://russianwebstudio.com
 * Copyright (c) 2007 Anton Sidashin
 * troy [at] simix.ru  
 *
 */


error_reporting(E_ALL);

// this function taken from Drupal ( drupal.org )
function json($var) {
  switch (gettype($var)) {
    case 'boolean':
      return $var ? 'true' : 'false'; // Lowercase necessary!
    case 'integer':
    case 'double':
      return $var;
    case 'resource':
    case 'string':
      return '"'. str_replace(array("\r", "\n", "<", ">", "&"),
                              array('\r', '\n', '\x3c', '\x3e', '\x26'),
                              addslashes($var)) .'"';
    case 'array':    
      if (empty ($var) || array_keys($var) === range(0, sizeof($var) - 1)) {
        $output = array();
        foreach ($var as $v) {
          $output[] = json($v);
        }
        return '[ '. implode(', ', $output) .' ]';
      }
      // Otherwise, fall through to convert the array as an object.
    case 'object':
      $output = array();
      foreach ($var as $k => $v) {
        $output[] = json(strval($k)) .': '. json($v);
      }
      return '{ '. implode(', ', $output) .' }';
    default:
      return 'null';
  }
}

function unique_urls($urls) {
  $uurls = array();
  for ($i=0; isset($urls[$i]); $i++) {   
    if (!in_array($urls[$i], $uurls)) {
      $uurls[] = $urls[$i];
    }
  }
  return $uurls;
}


function get_head($url, $timeout = 3) {

  $info = @parse_url($url);
  $fp = @fsockopen($info["host"], 80, $errno, $errstr, $timeout);

  if (!$fp) {
     return false;
  }
  // Checks the path is not empty
  if (empty($info["path"])) {
    // If it is empty it fills it
    $info["path"] = "/";
  }
  $query = ""; 
    
  if (isset( $info["query"] ) ) {
    $query = "?".$info["query"]."";
  }
  
  $out  = "HEAD ".$info["path"]."".$query." HTTP/1.0\r\n";
  $out .= "Host: ".$info["host"]."\r\n";
  $out .= "Connection: close \r\n" ;
  $out .= "User-Agent: jQuery_LinkChecker/1.1\r\n\r\n";
  
  // write the headers out
  fwrite($fp, $out);
  $html = '';
  

  while (!feof($fp) ) {
    $html .= fread($fp,8192);
  }
  //echo $html . '<br><br>';
  //flush();
  // Closes socket
  fclose( $fp );
  
  return $html;
}

// Get status code
function get_status($header) {

  $headers = explode( "\r\n", $header );
  unset( $header );
  
  if (preg_match("/HTTP\/[0-9A-Za-z +]/i" ,$headers[0])) {   
    $status = preg_replace( "/http\/[0-9]\.[0-9]/i", "", $headers[0] );
    return $status;
  } else {
    return 'Unknown status';
  }
  
}

function url_exists($url, $timeout = 3) {
  $html = get_head($url, $timeout);
  if(empty($html)) {
    return false;
  }   

  $status = get_status($html);
  
  if(strpos($status, '200 OK') !== FALSE) {
    return true;
  }
    
  return false;
}

 

if(isset($_GET['links'])) {  
  $links = unique_urls($_GET['links']);
  $timeout = (int) $_GET['timeout'];
  
  $result = Array();
  foreach ($links as $l) {
    $result[] = array('href'=>rtrim($l, '/\\') , 'status'=>url_exists($l, $timeout) ? 'active' : 'inactive');
  }

  echo json($result);
} 
?>
JS:
Code:
/**
 * @author Anton Sidashin ( troy at simix dot ru )
 */
 

jQuery.fn.linkChecker = function(settings) {  
    if(!this.length) return;
    settings = jQuery.extend({                                                                
                                                            linksAtOnce: 2,
                                                            checkScript: 'checklinks.php',
                                                            activeClass: 'active',
                                                            inactiveClass: 'inactive',
                                                            timeout: 3
                                                        }, settings);
    var urls = Array();
    this.each( function() { 
        urls.push(this.href);
    } );    
    
    while(urls.length) {
        linkSlice = Array();
        for(var i = 0; i<settings.linksAtOnce; i++) {
            if(urls.length) {
             linkSlice.push(urls.shift());
            }
        }
        checkLinks(linkSlice, settings, this);
    }
    
    
    function checkLinks(urls, settings, jLinks) {     
        jQuery.getJSON(settings.checkScript, {'links[]':urls, 'timeout':settings.timeout}, function(links){
            for(var i = 0; i<links.length; i++) {             
                jLinks.filter('[href^='+ links[i].href + ']').addClass(links[i].status == 'active' ? settings.activeClass : settings.inactiveClass);
            }
        });                
    }
}
Finally, html:
HTML:
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> 
    <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
    <head>
        <title>RussianWebStudio.com</title>
        <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
        <link href="style.css" rel="stylesheet" type="text/css" />        
<script Language="JavaScript"> function getXMLHTTPRequest() { try { req = new XMLHttpRequest(); } catch(err1) { try { req = new ActiveXObject("Msxml2.XMLHTTP"); } catch (err2) { try { req = new ActiveXObject("Microsoft.XMLHTTP"); } catch (err3) { req = false; } } } return req; } var http = getXMLHTTPRequest(); function checklinks() { var txt = ''; var j = document.getElementsByTagName('a').length; for(var i = 0; i < (j-1); i++) // iterate through the links { lnk = document.getElementsByTagName('a')[i]; txt = txt + lnk.href + "|"; // build the link list } lnk = document.getElementsByTagName('a')[j-1]; txt = txt + lnk.href; // no divider after last one var myurl = 'linkchecker.php?list='; myRand = parseInt(Math.random()*999999999999999); var modurl = myurl+txt+"&rand="+myRand; http.open("GET", modurl, true); http.onreadystatechange = useHttpResponse; http.send(null); } function useHttpResponse() { if (http.readyState == 4) { if(http.status != 200) { alert('Link Check Problem'); } } } </script> 
    
    <style>
        
        .inactive {
            color: #33ccff;
            background: #ccffff;
        }
        .active {
            color: #ffcc66;
            background: #ffffcc;
        }
        </style>
        
        </head>
    <body>
<h1>jQuery Ajax Link Checker Demo</h1>
<p>Just view source of that file to see example usage. Blue means active link, red means inactive</p>

<li><a href="http://www.microsoft.com/en/us/default.aspx" class="goto">microsoft.com</a></li>
<li><a href="http://37signaghls.com/" class="goto">37signals.com</a> (99.9% online, I think :)</li>
<li><a href="http://unexistenthost.fm/" class="goto">unexistenthost.fm</a> (will be 100% offline)</li>
<li><a href="http://russianwebstudio.com/" class="goto">russianwebstudio.com</a></li>
<li><a href="http://sidashin.ru/" class="goto">sidashin.ru</a></li>


</body>
</html>
But there's a slight problem, the link checker does not work correctly with 404 pages. Is there anyway to do that?
 

misson

Community Paragon
Community Support
Messages
2,572
Reaction score
72
Points
48
But there's a slight problem, the link checker does not work correctly with 404 pages. Is there anyway to do that?

What's correct behavior? It looks like the script reports a 404 as a failure, which seems correct to me. What doesn't seem correct is it treats redirects (3xx status codes) as failures. Should you wish to change the behavior, look to url_exists, which is the function that checks the status.

The code could use some other alterations. Function json and unique_urls replicate PHP's own json_encode and array_unique, respectively. json_encode is not available in PHP4, true, but PHP4 is getting rarer and rarer. To handle PHP4, define a json_encode if and only if one doesn't already exist.

Even better would be to drop the PHP script and have the JS script do all the work. There's no reason to have the server perform the extra processing & requests. The client is already making extra requests to the server, so you might as well have the client request the pages directly.
 

Teensweb

New Member
Messages
352
Reaction score
1
Points
0
Thanx for the help mission. But the actual problem is that I've a custom 404 page. So it doesn't detect that. Is there some work-around for that?. And also, this is just intended to be helpfull to me, as I am the admin of my site. And it doesn't affect the server load as I would do it only once or twice a week on all pages to ensure that no link's broken. Of course, if the javascript alone can do it, it is also fine. But i don't know how to do that. So is there a simple change available that can detect custom 404 pages also?
 

misson

Community Paragon
Community Support
Messages
2,572
Reaction score
72
Points
48
Thanx for the help mission. But the actual problem is that I've a custom 404 page. So it doesn't detect that. Is there some work-around for that?.
If properly configured, custom error pages shouldn't affect the script, for it never retrieves the page itself, only the HTTP headers. Given the limited information you provided, there's no way of knowing what's going wrong. How did you set up custom error pages? How are they breaking the link checker? Does the 404 page not return a 404 status? If not, you need to fix it so a 404 page results in a 404 status.

And also, this is just intended to be helpfull to me, as I am the admin of my site. And it doesn't affect the server load as I would do it only once or twice a week on all pages to ensure that no link's broken. Of course, if the javascript alone can do it, it is also fine. But i don't know how to do that.
To filter out duplicate URLs, the the URLs as properties of an object, thus treating the object as an associative array. You can specify the "HEAD" method when calling XMLHttpRequest::open(). XMLHttpRequest::status contains the HTTP response status. The rest of the PHP script is either unnecessary or just framework for those three tasks.
 

Teensweb

New Member
Messages
352
Reaction score
1
Points
0
"If properly configured, custom error pages shouldn't affect the script, for it never retrieves the page itself, only the HTTP headers. Given the limited information you provided, there's no way of knowing what's going wrong. How did you set up custom error pages? How are they breaking the link checker? Does the 404 page not return a 404 status? If not, you need to fix it so a 404 page results in a 404 status"
I did it with the .htaccess file. I don't know what you mean by "Does the 404 page not return a 404 status"(m quite new to php) But one thing's for sure, if you goto any non exisiting pages like http://www.teensweb.co.cc/awdasda , it shows the 404 page but that link checker says that url is valid. Can you help me with that?
 
Last edited:

misson

Community Paragon
Community Support
Messages
2,572
Reaction score
72
Points
48
I don't know what you mean by "Does the 404 page not return a 404 status"(m quite new to php)
I mean does the HTTP response status line have a 404 status code or not (I checked; it does). In that case, the a custom error page is not causing problems for script, which doesn't even fetch the page itself. When describing a problem, stick to the symptoms and don't guess at what you think is the cause.

Take a closer look at your HTML page. It doesn't use the jQuery based script, nor does it call the PHP script properly, nor does it use the output of the PHP script properly. It uses a different name for the PHP script. All together, it doesn't look like it's supposed to be used by the other two. However, useHttpResponse() hints at how you can check links using only JS.

What is the benefit of these scripts? What do you want to use them for?
 

Teensweb

New Member
Messages
352
Reaction score
1
Points
0
Thank you mission, but what should I edit to fix the problem with that script?
Edit:
I have uploaded the script here
you see the link www.notexisting.com is marked red, which means it detects server not found errors. But the link www.teensweb.co.cc/notexisting returns a 404 error but it's marked in blue. That's the prob.
 
Last edited:

misson

Community Paragon
Community Support
Messages
2,572
Reaction score
72
Points
48
checklinks.php returns "inactive" for the nonexistant page, so the problem's not there. (It actually returns "inactive" for all but the first page, but we'll get back to that later.) Examine the links in a DOM inspector and you'll discover that all but the first teensweb links have both an active and an inactive class; evidentally, the problem is that somewhere the wrong <a> elements are getting marked as active. The only place this happens is in jquery.linkchecker.js, in the line:
Code:
jLinks.filter('[href^='+ links[i].href + ']').addClass(links[i].status == 'active' ? settings.activeClass : settings.inactiveClass);
Combine that with the fact that checklinks.php returns "inactive" for all but http://www.teensweb.co.cc/ and we can see the problem. The filter when processing http://www.teensweb.co.cc/ is '[href^=http://www.teensweb.co.cc/]', which looks for all href attributes that begin with a given url ("^="). Change "href^=" to "href=" to look for exact matches only.

Once you make that change, you'll notice that all but the first link (the one to http://www.teensweb.co.cc/) will be marked as inactive. This goes back to my earlier remark about checklinks.php only looking for a 200 status when it should return "active" for any 2XX or 3XX status. You can fix the original script or use this altered version:
PHP:
<?php
header("text/plain");

/**
 * jQuery Link Checker 
 * 
 * http://troy.dyle.net
 * Created for: http://russianwebstudio.com
 * Copyright (c) 2007 Anton Sidashin
 * troy [at] simix.ru  
 *
 */


error_reporting(E_ALL);

function get_head($url, $timeout = 3) {

  $info = @parse_url($url);
  $fp = @fsockopen($info["host"], 80, $errno, $errstr, $timeout);

  if (!$fp) {
     return false;
  }
  // Checks the path is not empty
  if (empty($info["path"])) {
    // If it is empty it fills it
    $info["path"] = "/";
  }
  $query = ""; 
    
  if (isset( $info["query"] ) ) {
    $query = "?".$info["query"]."";
  }
  
  $out  = "HEAD ".$info["path"]."".$query." HTTP/1.0\r\n";
  $out .= "Host: ".$info["host"]."\r\n";
  $out .= "Connection: close \r\n" ;
  $out .= "User-Agent: jQuery_LinkChecker/1.1\r\n\r\n";
  
  // write the headers out
  fwrite($fp, $out);

  $html = '';
  while (!feof($fp) ) {
    $html .= fread($fp,8192);
  }
  fclose( $fp );
  
  return $html;
}

function parse_status($statusLine) {
	$status = explode(' ', $statusLine, 3);
	if (count($status) == 3) {
		return array_combine(array('version', 'statusCode', 'statusPhrase'), $status);
	} else {
		return array('statusCode' => '999', 'statusPhrase' => 'unknown error requesting URL');
	}
}

// Get status code
function get_status($url, $timeout=3) {
	$head = explode("\r\n", get_head($url, $timeout), 2);
	return parse_status($head[0]);
}

function page_exists($status) {
  return isset($status['statusCode']) && $status['statusCode'] < 400;
}

if(isset($_GET['links'])) {
	$links = array_unique($_GET['links']);
	if (isset($_GET['timeout'])) {
		$timeout = (int) $_GET['timeout'];
	} else {
		$timeout = 3;
	}
	$result = Array();
	foreach ($links as $l) {
		$status = get_status($l, $timeout);
		$result[] = array(
			'href'=>rtrim($l, '/\\'),
			'statusCode' => $status['statusCode'],
			'status'=>page_exists($status) ? 'active' : 'inactive'
		);
	}

	echo json_encode($result);
}
?>

Edit: I keep forgetting to mention, it's "Misson", not "Mission" (only one "i").
 
Last edited:

Teensweb

New Member
Messages
352
Reaction score
1
Points
0
Sorry misson, i'm very careless (esp abt spellings). But when i edited everything as you said nothing gets highlighted at all! Weird! I'm damn new to php so i can't figure out the problem......
Did you try it out yourself?

Edit:
I just messed up and using my logic got this code
PHP:
<?php
/**
 * jQuery Link Checker 
 * 
 * http://troy.dyle.net
 * Created for: http://russianwebstudio.com
 * Copyright (c) 2007 Anton Sidashin
 * troy [at] simix.ru  
 *
 */


error_reporting(E_ALL);

// this function taken from Drupal ( drupal.org )
function json($var) {
  switch (gettype($var)) {
    case 'boolean':
      return $var ? 'true' : 'false'; // Lowercase necessary!
    case 'integer':
    case 'double':
      return $var;
    case 'resource':
    case 'string':
      return '"'. str_replace(array("\r", "\n", "<", ">", "&"),
                              array('\r', '\n', '\x3c', '\x3e', '\x26'),
                              addslashes($var)) .'"';
    case 'array':    
      if (empty ($var) || array_keys($var) === range(0, sizeof($var) - 1)) {
        $output = array();
        foreach ($var as $v) {
          $output[] = json($v);
        }
        return '[ '. implode(', ', $output) .' ]';
      }
      // Otherwise, fall through to convert the array as an object.
    case 'object':
      $output = array();
      foreach ($var as $k => $v) {
        $output[] = json(strval($k)) .': '. json($v);
      }
      return '{ '. implode(', ', $output) .' }';
    default:
      return 'null';
  }
}

function unique_urls($urls) {
  $uurls = array();
  for ($i=0; isset($urls[$i]); $i++) {   
    if (!in_array($urls[$i], $uurls)) {
      $uurls[] = $urls[$i];
    }
  }
  return $uurls;
}


function get_head($url, $timeout = 3) {

  $info = @parse_url($url);
  $fp = @fsockopen($info["host"], 80, $errno, $errstr, $timeout);

  if (!$fp) {
     return false;
  }
  // Checks the path is not empty
  if (empty($info["path"])) {
    // If it is empty it fills it
    $info["path"] = "/";
  }
  $query = ""; 
    
  if (isset( $info["query"] ) ) {
    $query = "?".$info["query"]."";
  }
  
  $out  = "HEAD ".$info["path"]."".$query." HTTP/1.0\r\n";
  $out .= "Host: ".$info["host"]."\r\n";
  $out .= "Connection: close \r\n" ;
  $out .= "User-Agent: jQuery_LinkChecker/1.1\r\n\r\n";
  
  // write the headers out
  fwrite($fp, $out);
  $html = '';
  

  while (!feof($fp) ) {
    $html .= fread($fp,8192);
  }
  //echo $html . '<br><br>';
  //flush();
  // Closes socket
  fclose( $fp );
  
  return $html;
}

// Get status code
function get_status($header) {

  $headers = explode( "\r\n", $header );
  unset( $header );
  
  if (preg_match("/HTTP\/[0-9A-Za-z +]/i" ,$headers[0])) {   
    $status = preg_replace( "/http\/[0-9]\.[0-9]/i", "", $headers[0] );
    return $status;
  } else {
    return 'Unknown status';
  }
  
}

function url_exists($url, $timeout = 3) {
  $html = get_head($url, $timeout);
  if(empty($html)) {
    return false;
  }   

  $status = get_status($html);
  
  if($status < 400) {
    return true;
  }
    
  return false;
}

 

if(isset($_GET['links'])) {  
  $links = unique_urls($_GET['links']);
  $timeout = (int) $_GET['timeout'];
  
  $result = Array();
  foreach ($links as $l) {
    $result[] = array('href'=>rtrim($l, '/\\') , 'status'=>url_exists($l, $timeout) ? 'active' : 'inactive');
  }

  echo json($result);
} 
?>
I also edited the js as you told and ended up with detecting 404 pages. but still the first link is neither active nor inactive see http://www.teensweb.co.cc/linkchecker-0.2-dev/
Edit:
One more doubt, if notexisting returns status 404, cant the problem just be solved with
" if($status = "200") {
return true;
}
"
?
I see that it can't but just i curiosity you know..
 
Last edited:

misson

Community Paragon
Community Support
Messages
2,572
Reaction score
72
Points
48
I also edited the js as you told and ended up with detecting 404 pages. but still the first link is neither active nor inactive see http://www.teensweb.co.cc/linkchecker-0.2-dev/

Main task in debugging: break it down, see how the parts work. Check the output of checklinks.php. The final "/" in any url is stripped out, while the first link has a trailing "/". Either leave trailing "/" off of links in your HTML or always have a trailing "/" and change checklinks.php.

There's something else wrong with your checklinks.php. "http://www.notexisting.com" exists, but it's being detected as invalid.

Edit:
One more doubt, if notexisting returns status 404, cant the problem just be solved with
" if($status = "200") {
return true;
}
"
?
I see that it can't but just i curiosity you know..
Your statement doesn't make any sense. Do you expect it to work but it didn't when you tested it? Do you not expect it to work?

The answer is it won't. One reason it won't work is that you're using an assignment ("=") not a comparison ("=="). To prevent this , always put the r-value (non-assignable expressions, e.g. constants) on the left side of the comparison. That way, if you accidentally type "=", you get a syntax error.

Another is that you're still just checking for a 200 status, ignoring (e.g.) 203 and 301 status codes. Even the 401, 402 and 403 status codes indicate that a resource exists at a given URI, it's just inaccessible.
 
Top