Crawling

/**
 * Fetches a URL with cURL and returns the response body.
 *
 * Redirects are followed, the connect phase is capped at 5 seconds and the
 * whole transfer at 10 seconds. HTTP status codes >= 400 are treated as
 * failures (CURLOPT_FAILONERROR).
 *
 * @param string $url The URL to fetch.
 *
 * @return string|false The response body, or false when the handle could not
 *                      be created or the transfer failed.
 */
function curl_file_get_contents($url)
{
    $userAgent = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)';

    $curl = curl_init();
    if ($curl === false) {
        return false;
    }

    // Pre-set so the return value is defined on every path, including the catch branch.
    $contents = false;

    try {
        curl_setopt($curl, CURLOPT_URL, $url);                 // The URL to fetch.
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);      // Return the transfer as a string instead of printing it.
        curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 5);         // Seconds to wait while trying to connect.
        curl_setopt($curl, CURLOPT_USERAGENT, $userAgent);     // Contents of the "User-Agent: " request header.
        curl_setopt($curl, CURLOPT_FAILONERROR, true);         // Fail when the HTTP status code is >= 400.
        curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);      // Follow any "Location: " header the server sends.
        curl_setopt($curl, CURLOPT_AUTOREFERER, true);         // Set "Referer:" automatically when following a redirect.
        curl_setopt($curl, CURLOPT_TIMEOUT, 10);               // Maximum seconds the cURL functions may run.

        $contents = curl_exec($curl);
    } catch (Exception $ex) {
        echo $ex->getMessage();
    } finally {
        // Release the handle whether or not the transfer succeeded.
        curl_close($curl);
    }

    return $contents;
}

/**
 * Echoes every distinct `<h1 class="title" itemprop="name">…</h1>` element
 * found in the given HTML, in order of first appearance.
 *
 * The match is case-insensitive and may span several lines. Matches are
 * echoed verbatim, including the surrounding <h1> tags.
 *
 * @param string $content HTML to search (presumably a Flipkart product page).
 *
 * @return void
 */
function getFlipkartProductTitle($content)
{
    $pattern = '/<h1 class="title" itemprop="name">.+?<\/h1>/is';

    // preg_match_all() yields 0 when nothing matches and false on a PCRE error;
    // in both cases there is nothing to print.
    if (!preg_match_all($pattern, (string) $content, $matches)) {
        return;
    }

    // array_unique() keeps the first occurrence of each match, preserving order.
    foreach (array_unique($matches[0]) as $element) {
        echo $element;
    }
}
/**
 * Echoes every distinct `<span class="subtitle">…</span>` element found in
 * the given HTML, in order of first appearance.
 *
 * The match is case-insensitive and may span several lines. Matches are
 * echoed verbatim, including the surrounding <span> tags.
 *
 * @param string $content HTML to search (presumably a Flipkart product page).
 *
 * @return void
 */
function getFlipkartProductSubtitle($content)
{
    $pattern = '/<span class="subtitle">.+?<\/span>/is';

    // 0 (no match) and false (PCRE error) both mean there is nothing to print.
    if (!preg_match_all($pattern, (string) $content, $matches)) {
        return;
    }

    // array_unique() keeps the first occurrence of each match, preserving order.
    foreach (array_unique($matches[0]) as $element) {
        echo $element;
    }
}
/**
 * Echoes the first `<span class="selling-price …">…</span>` element found in
 * the given HTML.
 *
 * The class attribute must contain at least one character after
 * "selling-price". The match is case-insensitive and may span several lines.
 * Nothing is printed when no such element exists.
 *
 * @param string $content HTML to search (presumably a Flipkart product page).
 *
 * @return void
 */
function getFlipkartProductPrice($content)
{
    $pattern = '/<span class="selling-price.+?">.+?<\/span>/is';

    // Only the first match is wanted, so preg_match() is enough; checking its
    // result avoids reading a missing offset when the page has no price element.
    if (preg_match($pattern, (string) $content, $match) === 1) {
        echo $match[0];
    }
}

/**
 * Echoes every distinct `<h1 itemprop="name">…</h1>` element found in the
 * given HTML, in order of first appearance.
 *
 * The match is case-insensitive and may span several lines. Matches are
 * echoed verbatim, including the surrounding <h1> tags.
 *
 * @param string $content HTML to search (presumably a SnapDeal product page).
 *
 * @return void
 */
function getSnapDealProductTitle($content)
{
    $pattern = '/<h1 itemprop="name">.+?<\/h1>/is';

    // 0 (no match) and false (PCRE error) both mean there is nothing to print.
    if (!preg_match_all($pattern, (string) $content, $matches)) {
        return;
    }

    // array_unique() keeps the first occurrence of each match, preserving order.
    foreach (array_unique($matches[0]) as $element) {
        echo $element;
    }
}
//$content = curl_file_get_contents("http://www.flipkart.com/dimpy-stuff-bear-cuddles-16-53-inch/p/itme3ws6ueuhmzwe?pid=STFE3WS6Q8PYZZDF");
//getFlipkartProductTitle($content);
//getFlipkartProductSubtitle($content);
//echo '<br />';
//getFlipkartProductPrice($content);
//echo '<br />SnapDeal';
/**
 * Fetches a URL with cURL, following redirects, and returns the raw response.
 *
 * CURLOPT_HEADER is enabled, so the returned string contains the HTTP
 * response headers followed by the body.
 *
 * @param string $url The URL to fetch.
 *
 * @return string|false The response (headers + body), or false when the
 *                      handle could not be created or the transfer failed.
 */
function getSnapDealContent($url)
{
    $handle = curl_init();
    if ($handle === false) {
        return false;
    }

    curl_setopt($handle, CURLOPT_URL, $url);
    curl_setopt($handle, CURLOPT_HEADER, true);
    curl_setopt($handle, CURLOPT_USERAGENT, 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:13.0) Gecko/20100101 Firefox/13.0.1');
    curl_setopt($handle, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($handle, CURLOPT_RETURNTRANSFER, 1);

    $result = curl_exec($handle);

    // Close before returning; a statement placed after `return` never runs.
    curl_close($handle);

    return $result;
}
// Demo: download a SnapDeal product page and print its title element(s).
$content1 = getSnapDealContent("http://www.snapdeal.com/product/trident-hot-pink-cotton-towel/1759907333");

// curl_exec() reports a failed transfer as false; there is nothing to parse then.
if ($content1 !== false) {
    getSnapDealProductTitle($content1);
}

Advertisements

Identifying Comments in A Source Code Using Regular Expression

Here is a PHP script that opens a source code file, finds all occurrences of comments (/* … */ and //), and prints them.

<?php
/**
 * Echoes every distinct comment found in a piece of source text, in order of
 * first appearance, each followed by " \n ".
 *
 * Two comment forms are recognised: block comments running from an opening
 * slash-star to the nearest closing star-slash (possibly across lines), and
 * line comments running from "//" to the end of the line.
 *
 * The scan is purely textual: a "//" or "/*" inside a string literal (for
 * example the "//" in a URL) is reported as a comment too.
 *
 * @param string $text Source code to scan.
 *
 * @return void
 */
function get_comments($text)
{
    // A lazy [\s\S]*? stops at the first "*/", which selects the same block
    // comments as a nested-alternation pattern but without deep backtracking
    // on long comments. "." in the second branch does not cross "\n", so a
    // line comment ends at the line break.
    $pattern = '~/\*[\s\S]*?\*/|//.*~';

    // 0 (no match) and false (PCRE error) both mean there is nothing to print.
    if (!preg_match_all($pattern, (string) $text, $matches)) {
        return;
    }

    // array_unique() keeps the first occurrence of each comment, preserving order.
    foreach (array_unique($matches[0]) as $comment) {
        echo $comment . " \n ";
    }
}
// Read the whole input file and print the comments it contains.
$sourcePath = "source_code.txt";

// Checking readability first avoids looping on an invalid handle when the
// file is missing; file_get_contents() also releases the file automatically.
$str = is_readable($sourcePath) ? file_get_contents($sourcePath) : false;

if ($str === false) {
    echo "Unable to read " . $sourcePath . "\n";
} else {
    get_comments($str);
}
?>