正则表达式(二):抓取tvmao电视节目的类

前端之家收集整理的这篇文章主要介绍了正则表达式(二):抓取tvmao电视节目的类。前端之家小编觉得挺不错的,现在分享给大家,也给大家做个参考。
<?PHP
/**
 * Regex-based scraper for TV listings pages on tvmao.com.
 *
 * Every public method downloads one page through vget() (defined outside this
 * class; it must return the page HTML as a string) and extracts data from it.
 * None of the patterns use the "s" modifier, so "." never crosses a newline:
 * the programme patterns in particular rely on each <li> row being on a
 * single line of the page.
 */
class grab{
	/** Site root prepended to every site-relative link found on a page. */
	const BASE_URL = 'http://www.tvmao.com';

	/** Region entries collected so far; getCityUrl() appends on every call. */
	private $cityList = array();

	/**
	 * Collects the region/station-group links from the navigation table.
	 *
	 * Anchors without an href attribute are skipped. Results accumulate in the
	 * instance, so a second call returns the entries of both calls.
	 *
	 * @param string $url page to download
	 * @return array list of array('name'=>title attribute, 'link'=>absolute URL)
	 */
	public function getCityUrl($url){
		$content = vget($url);
		// The region links live in the block opened by class="pgnav">.
		$navBlock = $this->firstMatch('/class="pgnav">.*?<\/table>/',$content);

		foreach($this->allMatches('/<a.*?<\/a>/',$navBlock) as $anchor){
			$href = $this->hrefOf($anchor);
			if(null === $href){
				continue;
			}
			$title = $this->firstMatch('/title="(.*?)"/',$anchor,1);
			$this->cityList[] = array(
				'name'=>null === $title ? '' : $title,
				'link'=>self::BASE_URL.$href,
			);
		}
		return $this->cityList;
	}

	/**
	 * Reads a station page: its name, logo and channel list.
	 *
	 * @param string|null $url  page to download
	 * @param bool        $sign when true, every sibling station linked from the
	 *                          page is downloaded and parsed as well
	 * @return array list of array('name'=>string, 'list'=>channels, 'logo'=>string);
	 *               element 0 is the requested page, the rest are siblings
	 */
	public function getCityTvStation($url = null,$sign = false){
		$content = vget($url);
		$stations = array($this->parseStation($content,$url));

		if($sign){
			// Drop the channel list first so its anchors are not mistaken for sibling stations.
			$stripped = is_string($content) ? preg_replace('/<ul class="r" >.*?<\/ul>/','',$content) : null;
			$siblings = $this->allMatches('/<a href=".*?"><div class="plst">.*?<\/b><\/div><\/a>/',$stripped);

			foreach($siblings as $sibling){
				$href = $this->firstMatch('/<a href="([^"]*)/',$sibling,1);
				$label = $this->boldText($sibling);
				// Parse the sibling page itself, but never recurse further than one level.
				$nested = $this->getCityTvStation(self::BASE_URL.$href,false);
				$station = $nested[0];
				// The label shown on the listing page replaces the name parsed from the sibling page.
				$station['name'] = $label;
				$stations[] = $station;
			}
		}
		return $stations;
	}

	/**
	 * Reads the week navigation of a channel page.
	 *
	 * The first <dd> cell (the "last week" link) is ignored and at most seven
	 * cells are read after it. A cell without a link is the day currently shown
	 * and gets $url as its link.
	 *
	 * @param string $url page to download
	 * @return array map 1..7 => array('day'=>int, 'link'=>absolute URL,
	 *               'date'=>the parenthesised date with its first "-" removed)
	 */
	public function getWeekList($url){
		$content = vget($url);
		$bar = $this->firstMatch('/<div class="epghdc lt".*?<\/div>/',$content);
		$cells = array_slice($this->allMatches('/<dd.*?<\/dd>/',$bar),1,7);

		$week = array();
		foreach($cells as $offset=>$cell){
			$day = $offset + 1;
			$href = $this->hrefOf($cell);
			// The date is printed in parentheses, e.g. "(03-15)" becomes "0315".
			$parts = explode('-',(string)$this->firstMatch('/\((.*?)\)/',$cell,1));
			$week[$day] = array(
				'day'=>$day,
				'link'=>null === $href ? $url : self::BASE_URL.$href,
				'date'=>$parts[0].(isset($parts[1]) ? $parts[1] : ''),
			);
		}
		return $week;
	}

	/**
	 * Reads the programme list of one day.
	 *
	 * @param string $url page to download
	 * @return array array('titleName'=>content of the page heading with its
	 *               <a href=...> links removed, 'am'=>rows, 'pm'=>rows, 'nt'=>rows)
	 *               where each row is array('time'=>string, 'name'=>string,
	 *               'link'=>absolute URL or null)
	 */
	public function getChildsList($url){
		$content = vget($url);
		$heading = (string)$this->firstMatch('/<h1 style="float:left">(.*?)<\/h1>/',$content,1);
		$result = array('titleName'=>preg_replace('/<a href=.*?<\/a>/','',$heading));

		// Morning, afternoon and evening rows differ only in the span class and time reader.
		$readers = array('am'=>'getTimeAm','pm'=>'getTimePm','nt'=>'getTimeNt');
		foreach($readers as $period=>$timeReader){
			$rows = array();
			// Greedy ".*" without the "s" modifier: one match per line of the page.
			foreach($this->allMatches('/<li><span class="'.$period.'">.*<\/li>/',$content) as $row){
				$rows[] = array(
					'time'=>$this->$timeReader($row),
					'name'=>$this->getContent($row),
					'link'=>$this->getLink($row),
				);
			}
			$result[$period] = $rows;
		}
		return $result;
	}

	/** Returns the start time of a morning programme row. */
	private function getTimeAm($content){
		return $this->extractTime($content,'am');
	}

	/** Returns the start time of an afternoon programme row. */
	private function getTimePm($content){
		return $this->extractTime($content,'pm');
	}

	/** Returns the start time of an evening programme row. */
	private function getTimeNt($content){
		return $this->extractTime($content,'nt');
	}

	/**
	 * Returns the programme title of a row: everything between the time span
	 * and the closing </li>, with anchors, guide <div>s and tag residue removed.
	 *
	 * @param string $content one <li> programme row
	 * @return string title markup/text, '' when the row has no recognisable body
	 */
	private function getContent($content){
		$body = $this->firstMatch('/<\/span>.*?<\/li>/',$content);
		if(null === $body){
			return '';
		}
		// Patterns are applied in this order, each on the result of the previous one.
		$noise = array(
			'/<a title=.*?<\/a>/',
			'/<a.*?>/',
			'/<\/a>/',
			'/<\/span> /',
			'/\s*<\/li>/',
			'/<div class="tvgd".*?<\/div>/',
			'/<div class="tvcgd".*?<\/div>/',
		);
		return preg_replace($noise,'',$body);
	}

	/**
	 * Returns the absolute URL of the programme detail page linked from a row.
	 *
	 * @param string $content one <li> programme row
	 * @return string|null null when the row carries no programme link
	 */
	private function getLink($content){
		$body = $this->firstMatch('/<\/span>.*?<\/li>/',$content);
		if(null === $body){
			return null;
		}
		// Links inside the guide <div> and titled helper anchors are not the programme link.
		$body = preg_replace(array('/<div class="tvgd".*?<\/div>/','/<a title.*?<\/a>/'),'',$body);
		$href = $this->hrefOf($body);
		return null === $href ? null : self::BASE_URL.$href;
	}

	/** Placeholder kept from the original class; not implemented. */
	private function getChildInfo(){
		
	}

	/**
	 * Parses the part of a station page that describes the station itself.
	 *
	 * The channel list always starts with an entry for the page being parsed
	 * (linked to $url); its name comes from the first <li> of the list, or is
	 * '' when the list is missing. The remaining <li> items are linked channels.
	 *
	 * @param string|false $content page HTML as returned by vget()
	 * @param string|null  $url     URL the page was downloaded from
	 * @return array array('name'=>string, 'list'=>channels, 'logo'=>string)
	 */
	private function parseStation($content,$url){
		$header = $this->firstMatch('/<div class="pbar">.*?<\/div>/',$content);

		// The logo is the first <img ... /> inside the floated page heading.
		$heading = $this->firstMatch('/<h1 style="float:left">(.*?)<\/h1>/',$content,1);
		$image = $this->firstMatch('/<img.*?\/>/',$heading);
		$logo = $this->firstMatch('/src="(.*?)"/',$image,1);

		$block = $this->firstMatch('/<ul class="r".*?<\/ul>/',$content);
		$items = $this->allMatches('/<li>.*?<\/li>/',$block);

		$channels = array(array(
			'name'=>$this->boldText(isset($items[0]) ? $items[0] : null),
			'link'=>$url,
		));
		foreach(array_slice($items,1) as $item){
			$label = $this->firstMatch('/">(.*?)<\/a>/',$item,1);
			$channels[] = array(
				'name'=>$this->leadingText($label),
				'link'=>self::BASE_URL.$this->hrefOf($item),
			);
		}

		return array(
			'name'=>$this->boldText($header),
			'list'=>$channels,
			'logo'=>null === $logo ? '' : $logo,
		);
	}

	/** Returns the text between <span class="$period"> and the next "<" of a row. */
	private function extractTime($content,$period){
		$inner = $this->firstMatch('/<span class="'.$period.'">(.*)<\/span>/',$content,1);
		return $this->leadingText($inner);
	}

	/** Returns the text that follows the first <b> of $html, up to the next "<". */
	private function boldText($html){
		return $this->leadingText($this->firstMatch('/<b>(.*?)<\/b>/',$html,1));
	}

	/** Returns the value of the first href="..." attribute in $html, or null. */
	private function hrefOf($html){
		return $this->firstMatch('/href="(.*?)"/',$html,1);
	}

	/** Returns the part of $html before its first "<" (all of it if there is none). */
	private function leadingText($html){
		if(!is_string($html)){
			return '';
		}
		$cut = strpos($html,'<');
		return false === $cut ? $html : substr($html,0,$cut);
	}

	/**
	 * Returns capture group $group of the first match of $pattern in $subject,
	 * or null when there is no match or $subject is not a string.
	 */
	private function firstMatch($pattern,$subject,$group = 0){
		if(!is_string($subject) || 1 !== preg_match($pattern,$subject,$found)){
			return null;
		}
		return isset($found[$group]) ? $found[$group] : null;
	}

	/** Returns every full match of $pattern in $subject (empty list when none). */
	private function allMatches($pattern,$subject){
		if(!is_string($subject)){
			return array();
		}
		preg_match_all($pattern,$subject,$found);
		return isset($found[0]) ? $found[0] : array();
	}
}
?>


这是一个单子的样品代码

可惜忙活了两天,最后单子还是丢了,

可惜,多好玩的项目。

猜你在找的正则表达式相关文章