本文实例讲述了PHP实现将HTML页面转换成word并且保存的方法。分享给大家供大家参考,具体如下:
生成Word的原理是,将一堆规定好的xml压缩成一个zip包,并且把后缀名改成doc或者docx即可。
所以使用PHPWord,需要你的PHP环境安装zip.dll压缩扩展,我写了一个demo.
功能说明:
20150507 — HTML中的<p>标签和<ul>、<li>列表标签的获取
20150508 — 新增获取文章中的图片功能
20150509 — 新增行间距,并且过滤一下错误图片
20150514 — 新增表格处理,并且将代码改成面向对象
20150519 — 新增GD库处理网络图片
_Time();
$startMemory = $this->_memory();
$this->url = $url;
$UrlArr = parse_url($this->url);
$this->host = $UrlArr["scheme"]."://".$UrlArr['host'];
$this->CurrentDir = getcwd();
$this->LinetextArr["table"] = array();
$html = new simple_html_dom($this->url);
$this->HttpRequestArr[] = $this->url;
$this->HttpRequestTime++;
foreach($html->find($this->Allowtag) as $key=>$value)
{
if($value->tag == "table")
{
$this->ParseTable($value,$this->LinetextArr["table"]);
}
else
{
$this->AnalysisHtmlDom($value);
}
$this->error[] = error_get_last();
}
$endTime = $this->_Time();
$endMemory = $this->_memory();
$this->expendTime = round(($endTime-$startTime),2); //微秒
$this->expendmemory = round(($endMemory-$startMemory)/1000,2); //bytes
$this->CreateWordDom();
}
/**
 * Return the current Unix timestamp with sub-second precision.
 *
 * @return float Seconds since the Unix epoch, including the fractional part.
 */
private function _Time()
{
    // microtime(true) returns the float directly, so there is no need to
    // split the "msec sec" string form and sum its two halves by hand.
    return microtime(true);
}
/**
 * Report the amount of memory currently allocated to this PHP script.
 *
 * @return int Byte count as reported by memory_get_usage().
 */
private function _memory()
{
    $bytesInUse = memory_get_usage();

    return $bytesInUse;
}
/**
 * Collect the cells of an HTML table into $Arr, one entry per table row.
 *
 * Elements whose first child is a table, tbody, thead, tfoot or tr are treated
 * as wrappers and descended into recursively. Any other element is treated as
 * a row: each of its children becomes one cell of the form
 * array("tag" => ..., "text" => ...). A cell whose first child is a nested
 * table is skipped rather than flattened.
 *
 * The accumulator is the second argument and is taken by reference, matching
 * the two-argument call ParseTable($node, $rows) made elsewhere in this class.
 * The previous signature ($value, $i, $Arr) received the array by value, so
 * every collected cell was discarded on return, and its numeric row index
 * could make rows from different sections or tables share a slot. Rows are
 * now simply appended in document order.
 *
 * @param object $value Node exposing firstChild(), children, tag and plaintext.
 * @param array  $Arr   Row accumulator; every non-empty row is appended to it.
 * @return void
 */
private function ParseTable($value,&$Arr)
{
    $wrapperTags = array("table","tbody","thead","tfoot","tr");
    $first = $value->firstChild();
    if($first && in_array($first->tag,$wrapperTags,true))
    {
        foreach($value->children as $child)
        {
            $this->ParseTable($child,$Arr);
        }
        return;
    }
    $row = array();
    foreach($value->children as $cell)
    {
        $inner = $cell->firstChild();
        // Keep leaf cells and cells with ordinary content; skip cells that wrap a nested table.
        if(!$inner || $inner->tag != "table")
        {
            $row[] = array("tag"=>$cell->tag,"text"=>trim($cell->plaintext));
        }
    }
    // Elements without usable cells (e.g. an empty wrapper) must not create blank rows.
    if($row)
    {
        $Arr[] = $row;
    }
}
/**
 * Walk an HTML node and append one entry per leaf element to $this->LinetextArr.
 *
 * Nodes with children are descended into recursively. For a leaf:
 *  - "a"   is recorded with its href and inner text;
 *  - "img" is downloaded and measured, then recorded with the local path,
 *          alt text, width and height (images that cannot be fetched or
 *          measured are dropped);
 *  - any other tag is recorded with its inner text stripped of markup.
 *
 * @param object $value Node exposing has_child(), children, tag, href, src, alt and innertext.
 * @return void
 */
private function AnalysisHtmlDom($value)
{
    if($value->has_child())
    {
        foreach($value->children as $child)
        {
            $this->AnalysisHtmlDom($child);
        }
        return;
    }
    if($value->tag == "a")
    {
        $entry = array("tag"=>$value->tag,"href"=>$value->href,"text"=>$value->innertext);
    }
    else if($value->tag == "img")
    {
        $entry = $this->_buildImageEntry($value);
    }
    else
    {
        $entry = array("tag"=>$value->tag,"text"=>strip_tags($value->innertext));
    }
    // A failed image yields an empty array; do not store placeholders for it.
    if($entry)
    {
        $this->LinetextArr[] = $entry;
    }
}
/**
 * Resolve, download and measure an <img> node.
 *
 * @param object $value Image node exposing tag, src and alt.
 * @return array Entry with tag, src, text, width and height, or an empty array on failure.
 */
private function _buildImageEntry($value)
{
    $src = $this->unescape($value->src);
    $UrlArr = parse_url($src);
    if(!is_array($UrlArr) || !isset($UrlArr['host']))
    {
        // Host-less src: resolve the decoded value against the site root with exactly one "/".
        $src = $this->host."/".ltrim($src,"/");
        $UrlArr = parse_url($src);
    }
    else if(!isset($UrlArr['scheme']))
    {
        // Protocol-relative src ("//cdn.example.com/a.png"): borrow the page's scheme.
        $src = parse_url($this->host,PHP_URL_SCHEME).":".$src;
        $UrlArr = parse_url($src);
    }
    if(!is_array($UrlArr) || !isset($UrlArr['host']) || !isset($UrlArr['path']))
    {
        return array();
    }
    // Download the remote image; a falsy result means it was rejected or could not be fetched.
    $localSrc = $this->getImageFromNet($src,$UrlArr);
    if(!$localSrc)
    {
        return array();
    }
    $imgsArr = $this->GD($localSrc);
    if(empty($imgsArr['width']) || empty($imgsArr['height']))
    {
        // The downloaded file is not a readable image.
        return array();
    }
    return array("tag"=>$value->tag,"src"=>$localSrc,"text"=>$value->alt,"width"=>$imgsArr['width'],"height"=>$imgsArr['height']);
}
/**
 * Read an image's pixel dimensions, halving both when either side exceeds 800px.
 *
 * @param string $src Path or URL accepted by getimagesize().
 * @return array array("width" => int|null, "height" => int|null); both values
 *               are null when the file cannot be read as an image.
 */
private function GD($src)
{
    $info = getimagesize($src);
    if($info === false)
    {
        // Make the failure explicit instead of destructuring false into nulls.
        return array("width"=>null,"height"=>null);
    }
    $width = $info[0];
    $height = $info[1];
    if($width > 800 || $height > 800)
    {
        // Halve both sides to keep the aspect ratio; round so sizes stay whole pixels.
        $width = (int) round($width/2);
        $height = (int) round($height/2);
    }
    return array("width"=>$width,"height"=>$height);
}
/**
 * Decode escaped Unicode sequences in a string back to UTF-8 characters.
 *
 * After percent-decoding the input, three escape forms are converted:
 *  - "%uXXXX"   (JavaScript escape() style, four hex digits),
 *  - "&#xXXXX;" (hexadecimal character reference, four hex digits),
 *  - "&#NNN;"   (decimal character reference, code points up to 0xFFFF).
 * Everything else, including sequences that fail to convert, is left unchanged.
 *
 * @param string $str Text that may contain escaped characters.
 * @return string The text with recognised escapes replaced by UTF-8 characters.
 */
public function unescape($str) {
    $str = rawurldecode($str);
    // Ungreedy (/U): unmatched text is consumed one character at a time by the last alternative.
    preg_match_all("/(?:%u[0-9a-fA-F]{4})|&#x[0-9a-fA-F]{4};|&#\d+;|.+/U",$str,$r);
    $ar = $r[0];
    foreach($ar as $k=>$v) {
        $bytes = null;
        if(substr($v,0,2) === "%u"){
            $bytes = pack("H4",substr($v,-4));
        }
        elseif(substr($v,0,3) === "&#x"){
            $bytes = pack("H4",substr($v,3,-1));
        }
        elseif(substr($v,0,2) === "&#"){
            $code = (int) substr($v,2,-1);
            // pack("n") holds 16 bits only; larger code points are left as written.
            if($code <= 0xFFFF){
                $bytes = pack("n",$code);
            }
        }
        if($bytes !== null){
            $decoded = iconv("UCS-2BE","UTF-8",$bytes);
            // iconv returns false for invalid units (e.g. lone surrogates); keep the original text then.
            if($decoded !== false){
                $ar[$k] = $decoded;
            }
        }
    }
    return join("",$ar);
}
/**
 * Download a remote image into a per-host directory under the working directory.
 *
 * The file is stored as <CurrentDir>/<host>/<md5 of URL path>.<extension>.
 * Only URLs whose path ends in a supported image extension are fetched.
 *
 * NOTE(review): $Src comes from the scraped page, so this performs requests to
 * arbitrary attacker-chosen URLs; restrict allowed hosts/schemes if that matters.
 *
 * @param string $Src    Absolute URL of the image.
 * @param array  $UrlArr parse_url() result for $Src; needs "host" and "path".
 * @return string|false Path "<host>/<file>" relative to the working directory,
 *                      or false if the URL is rejected or the download/write fails.
 */
private function getImageFromNet($Src,$UrlArr)
{
    if(!is_array($UrlArr) || !isset($UrlArr['host']) || !isset($UrlArr['path']))
    {
        return false;
    }
    $host = $UrlArr['host'];
    // The host becomes a directory name: refuse values that could escape the working directory.
    if(trim($host,".") === "" || strpbrk($host,"/\\") !== false)
    {
        return false;
    }
    $this->ImgDir = $this->CurrentDir."/".$host;
    // Use the real (last) extension, case-insensitively, so "a.b.PNG" is accepted.
    $ext = strtolower(pathinfo($UrlArr['path'],PATHINFO_EXTENSION));
    $_supportedImageTypes = array('jpg','jpeg','gif','png','bmp','tif','tiff');
    if(!in_array($ext,$_supportedImageTypes,true))
    {
        return false;
    }
    $this->HttpRequestArr[] = $Src;
    $this->HttpRequestTime++;
    $data = file_get_contents($Src);
    if($data === false)
    {
        // Do not write an empty file for a failed download.
        $this->error[] = error_get_last();
        return false;
    }
    $this->_mkdir(); // creates the directory, or records the error
    $imgName = md5($UrlArr['path']).".".$ext;
    if(file_put_contents($this->ImgDir."/".$imgName,$data) === false)
    {
        $this->error[] = error_get_last();
        return false;
    }
    $this->DownImg++;
    return $host."/".$imgName;
}
/**
 * Create the image directory ($this->ImgDir) if it does not exist yet.
 *
 * On failure the last PHP error is appended to $this->error.
 *
 * @return void
 */
private function _mkdir()
{
    if(is_dir($this->ImgDir))
    {
        return;
    }
    // The mode must be an octal integer: the string "7777" is read as decimal 7777,
    // which is an unrelated permission set. 0777 is still reduced by the umask.
    // Recursive creation covers a missing parent directory.
    if(!mkdir($this->ImgDir,0777,true))
    {
        $this->error[] = error_get_last();
    }
}
/**
 * Build the Word document from $this->LinetextArr and send it to the client.
 *
 * A summary line from Details() is written first. Then every collected entry
 * is emitted: "li" as a list item, "img" as a centred image, "p" as a text
 * paragraph. The entry stored under the string key "table" is rendered as a
 * table, one row per element and one cell per row element. Entries with any
 * other tag are not written. Finally the document is passed to downFile().
 *
 * @return void
 */
private function CreateWordDom()
{
    $PHPWord = new PHPWord();
    $PHPWord->setDefaultFontName('宋体');
    $PHPWord->setDefaultFontSize(11);
    $styleTable = array('borderSize'=>6,'borderColor'=>'006699','cellMargin'=>120);
    $paragraphStyle = array('spacing'=>120);
    // New portrait section
    $section = $PHPWord->createSection();
    $section->addText($this->Details(),array(),$paragraphStyle);
    // Emit the collected content
    foreach($this->LinetextArr as $key=>$lineArr)
    {
        // Strict comparison: with "==" the integer key 0 equals "table" on PHP < 8.
        if($key === "table")
        {
            if(!is_array($lineArr) || !$lineArr)
            {
                // No rows were collected; do not emit an empty table.
                continue;
            }
            $PHPWord->addTableStyle('myOwnTableStyle',$styleTable);
            $table = $section->addTable("myOwnTableStyle");
            foreach($lineArr as $tr)
            {
                $table->addRow();
                foreach($tr as $td)
                {
                    $table->addCell(2000)->addText($td['text']);
                }
            }
            continue;
        }
        if(!isset($lineArr['tag']))
        {
            continue;
        }
        if($lineArr['tag'] == "li")
        {
            // Depth 0 (top level) instead of the empty string passed before.
            // NOTE(review): the spacing array sits in the font-style position here — confirm against the PHPWord version in use.
            $section->addListItem($lineArr['text'],0,array('spacing'=>120));
        }
        else if($lineArr['tag'] == "img")
        {
            $section->addImage($lineArr['src'],array('width'=>$lineArr['width'],'height'=>$lineArr['height'],'align'=>'center'));
        }
        else if($lineArr['tag'] == "p")
        {
            // Spacing goes in the third (paragraph style) argument, as the summary line above does.
            $section->addText($lineArr['text'],array(),$paragraphStyle);
        }
    }
    $this->downFile($PHPWord);
}
/**
 * Build the one-line run summary placed at the top of the document.
 *
 * Interpolates the request count, downloaded image count, elapsed time and
 * memory figure held on this object.
 *
 * @return string The summary sentence.
 */
public function Details()
{
    return "一共请求:{$this->HttpRequestTime}次,共下载的图片有{$this->DownImg}张,并且下载完成大约使用时间:{$this->expendTime}秒,整个程序执行大约消耗内存是:{$this->expendmemory}KB,";
}
/**
 * Save the document to disk and stream it to the client as a download.
 *
 * When $this->filename is empty it defaults to "<host of $this->url>.docx",
 * or "document.docx" if the URL has no host.
 *
 * @param object $PHPWord The populated PHPWord document.
 * @return void
 */
public function downFile($PHPWord)
{
    if(empty($this->filename))
    {
        $UrlArr = parse_url($this->url);
        $host = (is_array($UrlArr) && isset($UrlArr['host'])) ? $UrlArr['host'] : "document";
        $this->filename = $host.".docx";
    }
    // Save File
    $objWriter = PHPWord_IOFactory::createWriter($PHPWord,'Word2007');
    $objWriter->save($this->filename);
    header("Pragma: public");
    header("Expires: 0");
    // One Cache-Control header: a second header() call with the same name replaces the first.
    header("Cache-Control: must-revalidate, post-check=0, pre-check=0, public");
    header("Content-Description: File Transfer");
    // The Word2007 writer produces OOXML, so advertise the .docx media type.
    header('Content-Type: application/vnd.openxmlformats-officedocument.wordprocessingml.document');
    // Force the download; quote the name and strip characters that would break the header.
    $downloadName = str_replace(array('"',"\r","\n"),"",basename($this->filename));
    header('Content-Disposition: attachment; filename="'.$downloadName.'"');
    if(is_readable($this->filename))
    {
        header("Content-Length: ".filesize($this->filename));
        readfile($this->filename);
    }
}
}
上面的代码重点感觉不是word生成,而是Simplehtmldom的使用,这是一个开源的HTML解析器,之前有提到,这几天在看他的代码,
引出了两个学习方向
① 正则表达式
② 这个扩展的函数整理
看源代码的收获:
PHP的异常是可以捕获的,而且PHP的错误也是可以捕获的。
更多关于PHP相关内容感兴趣的读者可查看本站的相关专题。
希望本文所述对大家PHP程序设计有所帮助。