CyberDenix | Je te conseille de faire un shell_exec si tu es sous unix.
Le principe est d'avoir un père et des fils, mais d'utiliser le même fichier.php pour cela, que le père va relancer en ligne de commande en passant en argument une marque de sa paternité (très utile dans le cas de plusieurs pères, pour savoir à quel papa tel ou tel gosse appartient)
Voici, en speed, une partie du code que j'utilise pour un crawler de site en PHP :
Code :
- /**
- *
- */
- public function crawl($options = null) {
- if (!isset($options['--url']) || !$options['--url']) {
- return false;
- }
- $time = microtime(true);
- // Master-url initialization
- // ---------------------------------------------------------------------
- if (isset($options['--master-url'])) {
- $master_url = $options['--master-url'];
- } else {
- $master_url = $options['--url'];
- }
- // Master-id initialization
- // ---------------------------------------------------------------------
- if (isset($options['--master-id'])) {
- $master_id = $options['--master-id'];
- } else {
- $command_line = '';
- if (isset($options) && $options) {
- $command_line = PHP_CLI.' '.WSM_CRAWLER_PATH.'crawler.php';
- foreach ($options as $k => $v) {
- $command_line .= ' '.$k.' ';
- if (is_bool($v)) {
- $command_line .= (int)$v;
- } else if (is_numeric($v)) {
- $command_line .= (int)$v;
- } else {
- $command_line .= '\''.$v.'\'';
- }
- }
- }
- $master_id = $this->getDatabase()->exec('INSERT INTO `crawl` (`url`, `command_line`, `start_date`) VALUES (?, ?, ?)', array($options['--url'], $command_line, date('c')), null, false);
- $options['--id'] = $this->insertUrls(array($options['--url']), $master_id);
- }
- // Parse the url
- // ---------------------------------------------------------------------
- $parse_result = $this->parse($options);
- if (isset($parse_result['error'])) {
- file_put_contents('./curlerror.txt', $parse_result['error']['id'].' - '.$parse_result['error']['message'], FILE_APPEND | LOCK_EX);
- }
- // Retry (X times maximum) if a 500 error is catched
- // ---------------------------------------------------------------------
- $max_retries = (isset($options['--500-max-retries']) && $options['--500-max-retries']) ? $options['--500-max-retries'] : 3;
- if ($parse_result['http_code'] == 500) {
- $wait_between_retries = (isset($options['--500-wait-between-retries']) && $options['--500-wait-between-retries']) ? $options['--500-wait-between-retries'] : 10000;
- for ($i = 0; $i < $max_retries; ++$i) {
- usleep($wait_between_retries);
- $parse_result = $this->parse($options);
- if ($parse_result['http_code'] != 500) {
- break;
- }
- }
- }
- // Add new urls to the master crawler queue
- // ---------------------------------------------------------------------
- if (isset($parse_result['links'])) {
- $white_list_filters = array();
- if (isset($options['--white-list']) && $options['--white-list']) {
- $in_white_list = false;
- if (strpos($options['--white-list'], '|') !== false) {
- $white_list_filters = explode('|', $options['--white-list']);
- } else {
- $white_list_filters = array($options['--white-list']);
- }
- }
- $black_list_filters = array();
- if (isset($options['--black-list']) && $options['--black-list']) {
- $in_black_list = false;
- if (strpos($options['--black-list'], '|') !== false) {
- $black_list_filters = explode('|', $options['--black-list']);
- } else {
- $black_list_filters = array($options['--black-list']);
- }
- }
- $urls = array();
- foreach ($parse_result['links'] as $link) {
- if (isset($link['href']) && $link['href']) {
- $url = $link['href'];
- // Auto-prepend domain to domain-less urls
- // ---------------------------------------------------------
- $url = preg_replace('`^/`Usiu', $this->getDomain($master_url).'/', $url);
- // Delete anchors
- // ---------------------------------------------------------
- $url = preg_replace('`#[^/]`Usiu', '', $url);
- // If url is on the same level than the master url
- // ---------------------------------------------------------
- if ($this->getDomain($master_url) == $this->getDomain($url)) {
- $in_white_list = true;
- $in_black_list = false;
- // If url satisfies at least one of the url filters
- // ---------------------------------------------------------
- if ($white_list_filters) {
- $in_white_list = false;
- foreach ($white_list_filters as $filter) {
- if (preg_match(trim($filter), $url)) {
- $in_white_list = true;
- break;
- }
- }
- }
- // If url do not satisfies any of the url filters
- // ---------------------------------------------------------
- if ($black_list_filters) {
- foreach ($black_list_filters as $filter) {
- if (preg_match(trim($filter), $url)) {
- $in_black_list = true;
- break;
- }
- }
- }
- if ($in_white_list && !$in_black_list) {
- // If there is no preexisting url in the web page
- // -------------------------------------------------
- if (!isset($urls[$url])) {
- // If there is no preexisting url in database
- // ---------------------------------------------
- $res = $this->getDatabase()->req('SELECT 1
- FROM `url` `U`
- WHERE `U`.`fk_crawl` = ?
- AND `U`.`url` = ?', array($master_id, $url), null, false);
- if (!$res) {
- $urls[$url] = $url;
- }
- }
- }
- } else {
- //echo 'The following url was excluded : '.$url."\n".'because '.$this->getDomain($master_url).' != '.$this->getDomain($url).''."\n";
- }
- }
- }
- if ($urls) {
- $this->insertUrls($urls, $master_id);
- }
- }
- // Update the url information
- // ---------------------------------------------------------------------
- $this->updateUrl($parse_result, $options['--id']);
- // Master's management of crawling processes
- // ---------------------------------------------------------------------
- if (!isset($options['--master-id'])) {
- $nb_processes = $master_id ? $this->getNbProcesses($master_id) : 0;
- $max_processes = (isset($options['--max-processes']) && $options['--max-processes'] > 0) ? $options['--max-processes'] : 10;
- $start = true;
- $nb_parsed_urls = 0;
- $wait_for_url_label = '[Crawler '.$master_id.'] Waiting for urls ... ';
- $wait_for_url_label_length = mb_strlen($wait_for_url_label);
- $wait_for_url_token = array('-', '/', '|', '\\');
- $wait_for_url_token_count = count($wait_for_url_token);
- $max_processes_label = '[Crawler '.$master_id.'] Waiting for free processes ... ';
- $max_processes_label_length = mb_strlen($max_processes_label);
- $max_processes_token = array('-', '/', '|', '\\');
- $max_processes_token_count = count($max_processes_token);
- $urls = $this->getUnparsedUrls($master_id, $max_processes);
- while ($urls || $start) {
- $start = false;
- foreach ($urls as $url) {
- ++$nb_parsed_urls;
- // Do not exceed the --max-urls limit
- // ---------------------------------------------------------
- if (isset($options['--max-urls']) && $options['--max-urls'] > 0) {
- if ($nb_parsed_urls >= $options['--max-urls']) {
- echo '[Crawler '.$master_id.'] --max-urls '.$options['--max-urls'].' reached, stop crawl'."\n";
- break(2);
- }
- }
- // Do not exceed the --max-processes limit
- // ---------------------------------------------------------
- $w = 0;
- while ($nb_processes >= $max_processes) {
- if (!$w) {
- echo $max_processes_label;
- } else {
- $w2_max = mb_strlen($max_processes_token[($w % $max_processes_token_count)]);
- for ($w2 = 0; $w2 < $w2_max; ++$w2) {
- echo chr(8);
- }
- }
- echo $max_processes_token[(++$w % $max_processes_token_count)];
- //echo '[Crawler '.$master_id.'] --max-processes '.$nb_processes.' reached, wait '.$usleep.' ms'."\n";
- usleep(1000);
- $nb_processes = $this->getNbProcesses($master_id);
- }
- if ($w) {
- $w2_max = $max_processes_label_length;
- for ($w2 = 0; $w2 < $w2_max + 1; ++$w2) {
- echo chr(8);
- }
- }
- // Lock the url, to not parse it twice
- // ---------------------------------------------------------
- $this->getDatabase()->exec('UPDATE `url` SET `fk_parsing_status` = ? WHERE `id` = ?', array(2, $url['id']), null, false);
- // Instanciate a crawling process in background
- // ---------------------------------------------------------
- $command_line = PHP_CLI.' '.WSM_CRAWLER_PATH.'crawler.php';
- $command_line .= ' --id \''.$url['id'].'\'';
- $command_line .= ' --url \''.$url['url'].'\'';
- $command_line .= ' --master-id \''.$url['masterId'].'\'';
- $command_line .= ' --master-url \''.$url['masterUrl'].'\'';
- foreach ($options as $option_name => $option_value) {
- if ($option_name !== '--id' &&
- $option_name !== '--url' &&
- $option_name !== '--master-id' &&
- $option_name !== '--master-url' &&
- $option_name !== '--max-processes' &&
- $option_name !== '--max-urls' )
- $command_line .= ' '.$option_name.' \''.$option_value.'\'';
- }
- // Comment this line can help to see error messages related to wrong paths
- $command_line .= ' > /dev/null'.' 2>/dev/null'.' &';
- //echo 'Executing '.$command_line."\n";
- shell_exec($command_line);
- $nb_processes = $this->getNbProcesses($master_id);
- echo '[Crawler '.$master_id.'] Process '.$url['url'].' ('.$nb_processes.' process'.(($nb_processes < 2) ? '' : 'es').')'."\n";
- }
- // As threads are running in background, the program could finish
- // because no url has been parsed (and no new url has been added
- // to the list) before the end of the launch of the X firsts threads.
- // That's why it is required to wait for some new urls, or kill
- // the wait loop if all urls have been crawled without finding
- // no new url.
- $w = 0;
- while (!($urls = $this->getUnparsedUrls($master_id, $max_processes))) {
- if (!$w) {
- echo $wait_for_url_label;
- } else {
- $w2_max = mb_strlen($wait_for_url_token[($w % $wait_for_url_token_count)]);
- for ($w2 = 0; $w2 < $w2_max; ++$w2) {
- echo chr(8);
- }
- }
- echo $wait_for_url_token[(++$w % $wait_for_url_token_count)];
- usleep(1000);
- $rs = $this->getDatabase()->req(' SELECT 1
- FROM `url` `U`
- WHERE `U`.`fk_crawl` = ?
- AND `U`.`fk_parsing_status` != ?', array($master_id, 3), null, false);
- if (!$rs) {
- break;
- }
- }
- if ($w) {
- $w2_max = $wait_for_url_label_length;
- for ($w2 = 0; $w2 < $w2_max + 1; ++$w2) {
- echo chr(8);
- }
- }
- }
- // Master waits for its crawling processes to finish
- // -----------------------------------------------------------------
- $previous_nb_processes = $nb_processes;
- while (($nb_processes = $this->getNbProcesses($master_id))) {
- usleep(2000);
- if ($nb_processes != $previous_nb_processes) {
- echo '[Crawler '.$master_id.'] No new url detected, waiting after '.$nb_processes.' process'.(($nb_processes < 2) ? '' : 'es').' to finish'."\n";
- }
- $previous_nb_processes = $nb_processes;
- }
- $this->getDatabase()->exec('UPDATE `crawl` SET `end_date` = ?', array(date('c')), null, false);
- echo '[Crawler '.$master_id.'] Crawl finished : '.$nb_parsed_urls.' urls parsed in '.(microtime(true) - $time).' s'."\n";
- }
- }
|
La partie qui t'intéresse tout particulièrement est :
Code :
// Instantiate a crawling process in background.
// FIX: escapeshellarg() on every interpolated value — the previous naive
// '...' quoting let a url containing a single quote break out of the
// argument and inject arbitrary shell commands into shell_exec().
// ---------------------------------------------------------
$command_line = PHP_CLI.' '.WSM_CRAWLER_PATH.'crawler.php';
$command_line .= ' --id '.escapeshellarg($url['id']);
$command_line .= ' --url '.escapeshellarg($url['url']);
$command_line .= ' --master-id '.escapeshellarg($url['masterId']);
$command_line .= ' --master-url '.escapeshellarg($url['masterUrl']);
foreach ($options as $option_name => $option_value) {
    // Per-url and master-only options are already set explicitly above.
    if ($option_name !== '--id' &&
        $option_name !== '--url' &&
        $option_name !== '--master-id' &&
        $option_name !== '--master-url' &&
        $option_name !== '--max-processes' &&
        $option_name !== '--max-urls') {
        $command_line .= ' '.$option_name.' '.escapeshellarg($option_value);
    }
}
// Commenting this line out can help to see error messages related to
// wrong paths.
$command_line .= ' > /dev/null 2>/dev/null &';
shell_exec($command_line);
$nb_processes = $this->getNbProcesses($master_id);
Message édité par CyberDenix le 18-08-2011 à 11:28:25 ---------------
Directeur Technique (CTO)
|