read())) {
        $a = explode("-", $entry);
        if ( is_numeric($a[0])) {
            $last_index = $a[0];
        }
    }
    $d->close();
    $last_index++;
    $local_files_path = $local_files_dir . "/" . str_pad($last_index, 8, "0", STR_PAD_LEFT) . "-" . $today;
    if ( !is_dir($local_files_path) ) {
        mkdir($local_files_path, 0777, true);
    }
    // store variables in session
    $_SESSION["local_files_path"] = $local_files_path;
} else {
    // the spidering is in progress, read variables from session
    $local_files_path = $_SESSION["local_files_path"];
    // now ready for parsing, go!!!
    // NOTE(review): as the braces are written this call sits inside the else
    // branch; confirm it was not meant to run after the whole if/else.
    get_and_parse_url($url . "/category/PC+Components", 1);
}

/**
 * Fetches one page, stores it below the local mirror directory and then
 * follows the links of interest found in it, one level deeper per call.
 *
 * Reads these globals:
 *   $url              - root URL of the site; links are resolved against it
 *   $root_filename    - file name used when the URL itself yields none
 *   $local_files_path - directory the fetched pages are written to
 *   $max_level        - deepest level that is still fetched
 *
 * Prints a trace of its work wrapped in a <pre> element.
 *
 * @param string $url_to_parse absolute URL of the page to fetch
 * @param int    $level        depth of this call; nothing happens once it exceeds $max_level
 * @return void
 * @throws Exception when do_post_request() cannot fetch the page
 */
function get_and_parse_url($url_to_parse, $level)
{
    global $url, $root_filename, $local_files_path, $max_level;

    if ($level > $max_level) {
        return;
    }

    echo "<pre>";
    // The traced values partly come from scraped pages, so escape them.
    echo htmlentities("get_and_parse_url('$url_to_parse', '$local_files_path', $level)") . "\n";

    // Part of the URL after "<root>/"; empty when the root itself is requested.
    $relative_url = (string) substr($url_to_parse, strlen($url) + 1);
    echo htmlentities("relative_url='$relative_url'") . "\n";

    list($save_to_dir, $filename) = spider_local_target($relative_url, $local_files_path, $root_filename);
    if ( !is_dir($save_to_dir) ) {
        mkdir($save_to_dir, 0777, true);
    }

    echo htmlentities("save_to_dir='$save_to_dir'") . "\n";
    echo htmlentities("filename='$filename'") . "\n";

    // get the html from remote server
    $html = do_post_request( $url_to_parse );

    // store fetched html to local dir
    file_put_contents( $save_to_dir . "/" . $filename, $html );

    // Debug aid, kept disabled: parse a previously stored page instead of the
    // one just fetched.
    // $filename = "data/00000001-2008-09-27/index.html";
    // $html = file_get_contents($filename);

    // now parse the html to get more links and data
    $links = spider_extract_links($html, $url);

    // print the found links of interest and descend into each of them
    foreach ($links as $value) {
        echo htmlentities($value) . "\n";
        get_and_parse_url($value, $level + 1);
        // break;
    }

    echo "</pre>";
}

/**
 * Maps the site-relative part of a URL to a local directory and file name.
 *
 * Everything up to the last "/" becomes a sub directory of $base_dir, the
 * rest becomes the file name. A file name that does not already end in
 * ".htm" or ".html" gets ".html" appended.
 *
 * NOTE(review): $relative_url originates from scraped links and ".." segments
 * are not filtered here - confirm whether that needs guarding.
 *
 * @param string $relative_url     URL path relative to the site root
 * @param string $base_dir         directory the mirror is stored in
 * @param string $default_filename name used when the URL provides none
 * @return array array($directory, $filename)
 */
function spider_local_target($relative_url, $base_dir, $default_filename)
{
    $dir = $base_dir;
    $filename = $default_filename;

    // Strict comparison: a slash at offset 0 is still a slash.
    $last_slash = strrpos($relative_url, "/");
    if ($last_slash !== false) {
        $sub_dir = (string) substr($relative_url, 0, $last_slash);
        if ($sub_dir !== "") {
            $dir = $base_dir . "/" . $sub_dir;
        }
        $filename = (string) substr($relative_url, $last_slash + 1);
    } else if (strlen($relative_url) > 1) {
        $filename = $relative_url;
    }

    if ( (substr($filename, -4) !== ".htm") && (substr($filename, -5) !== ".html") ) {
        $filename = $filename . ".html";
    }

    return array($dir, $filename);
}

/**
 * Collects the absolute URLs of all links of interest in a page.
 *
 * A link is of interest when the matched anchor markup contains the text
 * "font-weight: bold;". Duplicates are removed.
 *
 * @param string $html     page source
 * @param string $base_url root URL of the site, without trailing slash
 * @return array list of absolute URLs, possibly empty
 */
function spider_extract_links($html, $base_url)
{
    $links = array();

    // Turn every double quote into a single quote so that the attribute
    // pattern below only has to deal with one quoting style.
    $html = str_replace("\"", "'", $html);

    // get all <a>'s with regex (ungreedy: each match stops at the first closing tag)
    if (!preg_match_all("|<a[^>]+>(.*)</[^>]+>|U", $html, $matches)) {
        return $links;
    }

    foreach ($matches[0] as $anchor) {
        // filter the ones that we need
        if (strpos($anchor, "font-weight: bold;") === false) {
            continue;
        }
        // [^']* instead of .* so the capture ends at the closing quote of
        // href and does not swallow the attributes that follow it.
        if (!preg_match("/href='([^']*)'/", $anchor, $m)) {
            continue;
        }
        $link = spider_absolute_link(trim($m[1]), $base_url);
        if ($link !== null) {
            $links[] = $link;
        }
    }

    return array_values(array_unique($links));
}

/**
 * Resolves one href value against the site root.
 *
 * @param string $link     href value as found in the page
 * @param string $base_url root URL of the site, without trailing slash
 * @return string|null absolute URL below $base_url, or null when the link is
 *                     empty, a pure fragment, or points outside the site
 */
function spider_absolute_link($link, $base_url)
{
    if ($link === "" || $link[0] === "#") {
        return null;
    }

    // Already absolute and on this site: keep untouched.
    if (strncmp($link, $base_url, strlen($base_url)) === 0) {
        return $link;
    }

    // Any other scheme ("http:", "mailto:", "javascript:", ...) or a
    // protocol-relative "//host" link leaves the site; prefixing the root URL
    // to it would only produce an unusable address.
    if (preg_match("#^([a-z][a-z0-9+.-]*:|//)#i", $link)) {
        return null;
    }

    // fix relative links
    if ($link[0] !== "/") {
        $link = "/" . $link;
    }
    return $base_url . $link;
}
/**
 * Opens $url with an HTTP POST stream context and returns everything that
 * can be read from it.
 *
 * @param string      $url              address to open
 * @param string|null $data             request body; no body option is set when null
 * @param string|null $optional_headers value for the "header" context option, if any
 * @return string the data read from the stream
 * @throws Exception when the URL cannot be opened or its data cannot be read
 */
function do_post_request($url, $data = null, $optional_headers = null)
{
    $http_options = array('method' => 'POST');
    if ($data !== null) {
        $http_options['content'] = $data;
    }
    if ($optional_headers !== null) {
        $http_options['header'] = $optional_headers;
    }
    $ctx = stream_context_create(array('http' => $http_options));

    // Forget earlier, unrelated errors so that error_get_last() below can
    // only report a failure raised by this call (function exists since PHP 7).
    if (function_exists('error_clear_last')) {
        error_clear_last();
    }

    // Warnings are suppressed because every failure is turned into an
    // exception; error_get_last() still records suppressed errors and
    // replaces $php_errormsg, which no longer exists in PHP 8.
    $fp = @fopen($url, 'rb', false, $ctx);
    if (!$fp) {
        $error = error_get_last();
        $reason = ($error !== null) ? $error['message'] : 'unknown error';
        throw new Exception("Problem with $url, $reason");
    }

    $response = @stream_get_contents($fp);
    // Capture the failure reason before closing the handle.
    $error = ($response === false) ? error_get_last() : null;
    fclose($fp);

    if ($response === false) {
        $reason = ($error !== null) ? $error['message'] : 'unknown error';
        throw new Exception("Problem reading data from $url, $reason");
    }
    return $response;
}
?>