<?php
/**
 * Scraper for the TV listing pages of www.tvmao.com.
 *
 * Every public method downloads one page through the external helper
 * vget($url) (defined outside this file; it is expected to return the page
 * HTML as a string) and extracts data with regular expressions.
 *
 * When a page cannot be fetched or does not contain the expected markup the
 * methods return empty / null values instead of raising undefined-offset
 * notices.
 */
class grab
{
    /** Site root that is prepended to the relative links found in the pages. */
    const BASE_URL = 'http://www.tvmao.com';

    /**
     * Region entries collected by getCityUrl().
     *
     * The list is never reset, so repeated getCityUrl() calls on the same
     * instance keep appending to it.
     *
     * @var array
     */
    private $cityList = array();

    /**
     * Collect the links of the regional TV station pages.
     *
     * Reads every anchor inside the class="pgnav" table and takes its title
     * attribute as the name and its href attribute as the (site-relative) link.
     * Anchors lacking either attribute are skipped.
     *
     * @param string $url page to fetch
     *
     * @return array list of array('name' => string, 'link' => string) holding
     *               everything collected so far by this instance
     */
    public function getCityUrl($url)
    {
        $content = vget($url);

        $navBlock = $this->firstMatch('/class="pgnav">.*?<\/table>/', $content);
        foreach ($this->allMatches('/<a.*?<\/a>/', $navBlock) as $anchor) {
            $name = $this->firstMatch('/title="(.*?)"/', $anchor, 1);
            $path = $this->firstMatch('/href="(.*?)"/', $anchor, 1);
            if ($name === null || $path === null) {
                continue;
            }
            $this->cityList[] = array('name' => $name, 'link' => self::BASE_URL . $path);
        }

        return $this->cityList;
    }

    /**
     * Describe the station shown on a page, optionally followed by its siblings.
     *
     * @param string|null $url  page to fetch
     * @param bool        $sign when true, every sibling station linked from the
     *                          page (class="plst" entries) is fetched as well
     *                          and appended to the result
     *
     * @return array list of array('name' => string|null, 'list' => array,
     *               'logo' => string|null). Index 0 always describes $url
     *               itself. 'list' holds array('name', 'link') entries for the
     *               channels of the station; 'logo' is the src attribute exactly
     *               as it appears in the page.
     */
    public function getCityTvStation($url = null, $sign = false)
    {
        $content = vget($url);
        $station = array('name' => null, 'list' => array());

        // Station name: text of the first <b> inside the "pbar" title bar.
        $titleBar = $this->firstMatch('/<div class="pbar">.*?<\/div>/', $content);
        $station['name'] = $this->firstMatch('/<b>([^<]*).*?<\/b>/', $titleBar, 1);

        // Station logo: src of the first <img ... /> inside the floating <h1>.
        $heading = $this->firstMatch('/<h1 style="float:left">(.*?)<\/h1>/', $content, 1);
        $logoTag = $this->firstMatch('/<img.*?\/>/', $heading);
        $station['logo'] = $this->firstMatch('/src="(.*?)"/', $logoTag, 1);

        $station['list'] = $this->parseChannelList($content, $url);

        $stations = array($station);

        if ($sign) {
            // Drop the channel list first so that its anchors cannot be
            // mistaken for sibling stations.
            $rest = preg_replace('/<ul class="r" >.*?<\/ul>/', '', (string) $content);
            $siblingRule = '/<a href=".*?"><div class="plst">.*?<\/b><\/div><\/a>/';

            foreach ($this->allMatches($siblingRule, $rest) as $entry) {
                $path = $this->firstMatch('/<a href="([^"]*)/', $entry, 1);
                $name = $this->firstMatch('/<b>([^<]*).*?<\/b>/', $entry, 1);
                if ($path === null) {
                    continue;
                }

                // Fetch the sibling page itself (without recursing further)
                // and label it with the name shown on the current page.
                $sibling = $this->getCityTvStation(self::BASE_URL . $path, false);
                $sibling = $sibling[0];
                $sibling['name'] = $name;
                $stations[] = $sibling;
            }
        }

        return $stations;
    }

    /**
     * Build the week navigation of a channel page.
     *
     * The first <dd> of the week bar (the "previous week" link) is ignored; at
     * most seven of the following cells are returned.
     *
     * @param string $url page to fetch
     *
     * @return array map 1..7 => array('day' => int, 'link' => string,
     *               'date' => string). A cell without a link (the day that is
     *               currently displayed) gets $url as its link. 'date' is the
     *               text found in parentheses with the first "-" removed,
     *               e.g. "(03-10)" becomes "0310".
     */
    public function getWeekList($url)
    {
        $content = vget($url);

        $weekBar = $this->firstMatch('/<div class="epghdc lt".*?<\/div>/', $content);
        $cells = array_slice($this->allMatches('/<dd.*?<\/dd>/', $weekBar), 1, 7);

        $week = array();
        $day = 1;
        foreach ($cells as $cell) {
            $path = $this->firstMatch('/href="(.*?)"/', $cell, 1);
            $parts = explode('-', (string) $this->firstMatch('/\((.*?)\)/', $cell, 1));

            $week[$day] = array(
                'day'  => $day,
                'link' => $path === null ? $url : self::BASE_URL . $path,
                'date' => $parts[0] . (isset($parts[1]) ? $parts[1] : ''),
            );
            $day++;
        }

        return $week;
    }

    /**
     * Read the programme list of one day.
     *
     * @param string $url page to fetch
     *
     * @return array array('titleName' => string, 'am' => array, 'pm' => array,
     *               'nt' => array). 'titleName' is the text of the floating
     *               <h1> with anchors and any other tags removed. Each slot is
     *               a list of array('time' => string, 'name' => string,
     *               'link' => string|null).
     */
    public function getChildsList($url)
    {
        $content = vget($url);

        $heading = $this->firstMatch('/<h1 style="float:left">(.*?)<\/h1>/', $content, 1);
        $heading = preg_replace('/<a href=.*?<\/a>/', '', (string) $heading);

        $listing = array('titleName' => trim(strip_tags((string) $heading)));

        // Morning, afternoon and evening programmes share the same markup and
        // differ only in the class of the leading <span>.
        foreach (array('am', 'pm', 'nt') as $slot) {
            $programmes = array();
            // Greedy on purpose (as in the original rule): "." does not cross
            // line breaks, so this relies on one <li> per line in the page.
            $itemRule = '/<li><span class="' . $slot . '">.*<\/li>/';

            foreach ($this->allMatches($itemRule, $content) as $item) {
                $programmes[] = array(
                    'time' => $this->getTime($item, $slot),
                    'name' => $this->getContent($item),
                    'link' => $this->getLink($item),
                );
            }
            $listing[$slot] = $programmes;
        }

        return $listing;
    }

    /**
     * Extract the channels listed in the <ul class="r"> block of a station page.
     *
     * The first <li> is the channel currently displayed: it has no link of its
     * own, so $url is used. The remaining items carry an anchor each.
     *
     * @param mixed       $content page HTML
     * @param string|null $url     address the page was fetched from
     *
     * @return array list of array('name' => string|null, 'link' => string)
     */
    private function parseChannelList($content, $url)
    {
        $block = $this->firstMatch('/<ul class="r".*?<\/ul>/', $content);
        $items = $this->allMatches('/<li>.*?<\/li>/', $block);
        if (count($items) === 0) {
            return array();
        }

        $current = array_shift($items);
        $channels = array(
            array(
                'name' => $this->firstMatch('/<b>([^<]*).*?<\/b>/', $current, 1),
                'link' => $url,
            ),
        );

        foreach ($items as $item) {
            $path = $this->firstMatch('/href="(.*?)"/', $item, 1);
            if ($path === null) {
                continue;
            }
            $channels[] = array(
                'name' => $this->firstMatch('/">([^<]*).*?<\/a>/', $item, 1),
                'link' => self::BASE_URL . $path,
            );
        }

        return $channels;
    }

    /**
     * Extract the broadcast time from one programme <li>.
     *
     * @param string $content HTML of the <li>
     * @param string $slot    class of the time span: 'am', 'pm' or 'nt'
     *
     * @return string text at the start of the span, '' when it is missing
     */
    private function getTime($content, $slot)
    {
        $rule = '/<span class="' . $slot . '">([^<]*).*<\/span>/';

        return (string) $this->firstMatch($rule, $content, 1);
    }

    /**
     * Extract the programme title from one programme <li>.
     *
     * Works on the part after the first </span> and removes, in this order:
     * anchors that carry a title attribute (including their text), the
     * remaining anchor tags (keeping their text), the leading "</span> ", the
     * closing </li>, and the "tvgd" / "tvcgd" <div> blocks.
     *
     * @param string $content HTML of the <li>
     *
     * @return string cleaned title, '' when the markup is not recognised
     */
    private function getContent($content)
    {
        $fragment = $this->firstMatch('/<\/span>.*?<\/li>/', $content);
        if ($fragment === null) {
            return '';
        }

        // preg_replace applies the rules one after another, each on the
        // result of the previous one, so the order below matters.
        $noise = array(
            '/<a title=.*?<\/a>/',
            '/<a.*?>/',
            '/<\/a>/',
            '/<\/span> /',
            '/\s*<\/li>/',
            '/<div class="tvgd".*?<\/div>/',
            '/<div class="tvcgd".*?<\/div>/',
        );

        return (string) preg_replace($noise, '', $fragment);
    }

    /**
     * Extract the detail link from one programme <li>.
     *
     * The "tvgd" <div> and anchors carrying a title attribute are discarded
     * first; the first href that remains is the programme link.
     *
     * @param string $content HTML of the <li>
     *
     * @return string|null absolute URL, or null when the item has no link
     */
    private function getLink($content)
    {
        $fragment = $this->firstMatch('/<\/span>.*?<\/li>/', $content);
        if ($fragment === null) {
            return null;
        }

        $fragment = preg_replace(
            array('/<div class="tvgd".*?<\/div>/', '/<a title.*?<\/a>/'),
            '',
            $fragment
        );

        $path = $this->firstMatch('/href="(.*?)"/', $fragment, 1);

        return $path === null ? null : self::BASE_URL . $path;
    }

    /**
     * Placeholder that was never implemented; nothing in this class calls it.
     */
    private function getChildInfo()
    {
    }

    /**
     * Return one capture group of the first match of $pattern in $subject.
     *
     * @param string $pattern PCRE pattern
     * @param mixed  $subject text to search; anything but a string (e.g. the
     *                        result of a failed download) counts as "no match"
     * @param int    $group   capture group to return, 0 for the whole match
     *
     * @return string|null matched text, null when there is no match
     */
    private function firstMatch($pattern, $subject, $group = 0)
    {
        if (!is_string($subject) || $subject === '') {
            return null;
        }
        if (preg_match($pattern, $subject, $found) !== 1 || !isset($found[$group])) {
            return null;
        }

        return $found[$group];
    }

    /**
     * Return the full text of every match of $pattern in $subject.
     *
     * @param string $pattern PCRE pattern
     * @param mixed  $subject text to search; non-strings yield no matches
     *
     * @return array list of matched strings, empty when nothing matches
     */
    private function allMatches($pattern, $subject)
    {
        if (!is_string($subject) || $subject === '') {
            return array();
        }
        if (!preg_match_all($pattern, $subject, $found)) {
            return array();
        }

        return $found[0];
    }
}
// The closing tag is kept on purpose: the file continues with plain-text
// notes, which must stay outside PHP mode.
?>
这是一个单子的样品代码,
可惜忙活了两天,最后单子还是丢了,
可惜,多好玩的项目。