Copy Of Php Web Spider 2

  • November 2019
  • PDF

This document was uploaded by a user who confirmed that they have permission to share it. If you are the author or own the copyright of this document, please report it to us using the DMCA report form. Report DMCA


Overview

Download & View Copy Of Php Web Spider 2 as PDF for free.

More details

  • Words: 530
  • Pages: 3
read())) { $a = explode("-", $entry); if ( is_numeric($a[0])) { $last_index = $a[0]; } } $d->close(); $last_index++; $local_files_path = $local_files_dir . "/" . str_pad($last_index, 8, "0", STR_PAD_LEFT) . "-" . $today; if ( !is_dir($local_files_path) ) { mkdir($local_files_path, 0777, true); } // store variables in session $_SESSION["local_files_path"] = $local_files_path; } else { // the spidering is in progess, read variables from session $local_files_path = $_SESSION["local_files_path"]; // now ready for parsing, go!!! get_and_parse_url($url . "/category/PC+Components", 1); } function get_and_parse_url($url_to_parse, $level) { global global global global

$url; $root_filename; $local_files_path; $max_level;

if ($level > $max_level) return;

$filename = $root_filename; $save_to_dir = $local_files_path; echo "<pre>"; echo "get_and_parse_url('$url_to_parse', '$save_to_dir', $level)\n"; $relative_url = substr($url_to_parse, strlen($url) + 1); echo "relative_url='$relative_url'\n"; if (strpos($relative_url, "/") != false) { $save_to_dir = $save_to_dir . "/" . substr($relative_url, 0, strrpos($relative_url, "/")); if ( !is_dir($save_to_dir) ) { mkdir($save_to_dir, 0777, true); } $filename = substr($relative_url, strrpos($relative_url, "/") + 1); } else if (strlen($relative_url) > 1) { $filename = $relative_url; } if ( (substr($filename, strlen($filename) - 4) != ".htm") && (substr($filename, strlen($filename) - 5) != ".html") ) { $filename = $filename . ".html"; } echo "save_to_dir='$save_to_dir'\n"; echo "filename='$filename'\n"; // get the html from remote server $html = do_post_request( $url_to_parse ); // store fetched html to local dir file_put_contents( $save_to_dir . "/" . $filename, $html ); // now parse the html to get more links and data // //

$filename = "data/00000001-2008-09-27/index.html"; $html = file_get_contents($filename);

$html = str_replace("\"", "'", $html); //echo htmlentities($html); // get all 's with regex preg_match_all("|]+>(.*)]+>|U", $html, $matches); // loop through the matches with foreach $i = 0; foreach($matches[0] as $value) { //echo htmlentities($value) . "\n"; // filter the one that we need if (strstr($value, "font-weight: bold;")) { $i++; //echo htmlentities($value) . "\n"; preg_match("/href='(.*)'/", $value, $m); $links[$i] = trim($m[1]);

// fix relative links if (substr($links[$i], strlen($url)) != $url) { if (substr($links[$i], 1) != "/") { $links[$i] = "/" . substr($links[$i], 1); } $links[$i] = $url . $links[$i]; } } }

}

// check if links are found if ($links == null) { // no links of interest are found } else { $links = array_unique($links); // print the found links of interest foreach($links as $value) { echo htmlentities($value) . "\n"; get_and_parse_url($value, $level + 1); // break; } } echo "";

/**
 * Opens $url with an HTTP POST stream context and returns the full response.
 *
 * @param string      $url              URL (or any fopen()-compatible path) to open
 * @param string|null $data             request body, passed as the 'content' context option
 * @param string|null $optional_headers extra request headers; only set when not null
 * @return string the complete response body
 * @throws Exception when the stream cannot be opened or its contents cannot be read
 */
function do_post_request($url, $data = null, $optional_headers = null)
{
    $params = array('http' => array(
        'method'  => 'POST',
        'content' => $data
    ));
    if ($optional_headers !== null) {
        $params['http']['header'] = $optional_headers;
    }
    $ctx = stream_context_create($params);

    // Forget earlier errors so that the reason reported below really belongs
    // to this request (error_clear_last() only exists from PHP 7.0 on).
    if (function_exists('error_clear_last')) {
        error_clear_last();
    }

    // Warnings are suppressed on purpose: failures are turned into exceptions.
    $fp = @fopen($url, 'rb', false, $ctx);
    if (!$fp) {
        // $php_errormsg was removed in PHP 8 (and was never populated unless
        // track_errors was on); error_get_last() still sees suppressed errors.
        $error = error_get_last();
        $reason = ($error !== null) ? $error['message'] : 'unknown error';
        throw new Exception("Problem with $url, $reason");
    }

    $response = @stream_get_contents($fp);
    // Release the handle on both the success and the failure path.
    fclose($fp);

    if ($response === false) {
        $error = error_get_last();
        $reason = ($error !== null) ? $error['message'] : 'unknown error';
        throw new Exception("Problem reading data from $url, $reason");
    }
    return $response;
}

Related Documents

Copy Of Php Web Spider
November 2019 10
Copy Of Php Web Spider 2
November 2019 1
Spider-man Web Of Shadows
October 2019 19
Spider 2
April 2020 1
Spider
April 2020 12
Spider
June 2020 7