php实现将HTML页面转换成word并且保存的方法
程序员文章站
2024-04-02 12:20:16
本文实例讲述了php实现将html页面转换成word并且保存的方法。分享给大家供大家参考,具体如下:
这里使用到一个php的工具,叫:PHPWord。
生成word的原...
本文实例讲述了php实现将html页面转换成word并且保存的方法。分享给大家供大家参考,具体如下:
这里使用到一个php的工具,叫:PHPWord。
生成word的原理是,将一堆规定好了的xml压缩成一个zip包,并且把后缀名改成doc或者docx即可。
所以使用phpword,需要你的php环境安装zip.dll压缩扩展,我写了一个demo.
功能说明:
20150507 — html中的<p>标签和<ol>列表标签的获取
20150508 — 新增获取文章中的图片功能
20150509 — 新增行间距,并且过滤一下错误图片
20150514 — 新增表格处理,并且将代码改成面向对象
20150519 — 新增gd库处理网络图片
require_once 'phpword.php';
require_once 'simplehtmldom.class.php';

/**
 * Downloads an HTML page, extracts its paragraphs, list items, images and
 * tables, and writes them into a Word (.docx) document that is then streamed
 * to the client as a download.
 *
 * Constructing the object runs the whole pipeline: fetch and parse the page,
 * download referenced images, build the document, save it to disk and send it.
 * Non-fatal PHP errors observed along the way are collected in $error.
 */
class word
{
    /** Page URL passed to the constructor. */
    private $url;

    /**
     * Extracted content. Integer keys hold entries in document order
     * (arrays with a "tag" key); the "table" key holds a list of table rows,
     * each row being a list of array("tag" => ..., "text" => ...) cells.
     */
    private $linetextarr = array();

    /** "scheme://host[:port]" of the page; used to resolve relative image URLs. */
    public $host;

    /** Directory images are stored in (current directory + image host name). */
    public $imgdir;

    /** Working directory at construction time. */
    public $currentdir;

    /** Collected error_get_last() entries. */
    public $error = array();

    /** Output file name; derived from the page host when empty. */
    public $filename = null;

    /** Selector handed to simple_html_dom::find() to pick the elements to convert. */
    public $allowtag = "p,ol,ul,table";

    /** Statistics reported by details(). */
    public $downimg = 0;
    public $expendtime = 0;
    public $httprequesttime = 0;
    public $contentlen = 0;
    public $httprequestarr = array();
    public $expendmemory = 0;

    /**
     * @param string      $url      Absolute URL of the page to convert.
     * @param string|null $filename Optional output file name. When omitted the
     *                              name is "<page host>.docx".
     *
     * @throws InvalidArgumentException When $url has no scheme or host.
     */
    public function __construct($url, $filename = null)
    {
        $starttime = $this->_time();
        $startmemory = $this->_memory();

        $urlarr = parse_url($url);
        if (!is_array($urlarr) || empty($urlarr['scheme']) || empty($urlarr['host'])) {
            throw new InvalidArgumentException("An absolute URL with scheme and host is required: " . $url);
        }

        $this->url = $url;
        $this->host = $urlarr['scheme'] . "://" . $urlarr['host'];
        if (isset($urlarr['port'])) {
            $this->host .= ":" . $urlarr['port'];
        }
        if ($filename !== null) {
            $this->filename = $filename;
        }
        $this->currentdir = getcwd();
        $this->linetextarr["table"] = array();

        $html = new simple_html_dom($this->url);
        $this->httprequestarr[] = $this->url;
        $this->httprequesttime++;

        foreach ($html->find($this->allowtag) as $value) {
            if ($value->tag == "table") {
                $this->parsetable($value, 0, $this->linetextarr["table"]);
            } else {
                $this->analysishtmldom($value);
            }
            $this->collecterror();
        }

        $endtime = $this->_time();
        $endmemory = $this->_memory();
        $this->expendtime = round($endtime - $starttime, 2); // seconds
        $this->expendmemory = round(($endmemory - $startmemory) / 1024, 2); // kilobytes

        $this->createworddom();
    }

    /**
     * @return float Current Unix timestamp with microsecond precision.
     */
    private function _time()
    {
        return microtime(true);
    }

    /**
     * @return int Memory currently allocated to the script, in bytes.
     */
    private function _memory()
    {
        return memory_get_usage();
    }

    /**
     * Appends the most recent PHP error to $error, skipping "no error" and
     * an entry identical to the one recorded last.
     */
    private function collecterror()
    {
        $last = error_get_last();
        if ($last !== null && $last !== end($this->error)) {
            $this->error[] = $last;
        }
    }

    /**
     * Walks a table element and appends one entry to $arr per table row.
     *
     * Containers (table, tbody, thead, tfoot, tr as first child) are descended
     * into; any other element is treated as a row whose children are cells.
     * Cells whose first child is a nested table are skipped.
     *
     * @param object $value Element node to inspect.
     * @param int    $i     Nesting depth of $value below the table element.
     * @param array  $arr   Row list to append to; passed by reference so rows
     *                      found in recursive calls reach the caller.
     */
    private function parsetable($value, $i, &$arr)
    {
        $containers = array("table", "tbody", "thead", "tfoot", "tr");
        $first = $value->firstChild();

        if ($first && in_array($first->tag, $containers, true)) {
            foreach ($value->children as $child) {
                $this->parsetable($child, $i + 1, $arr);
            }
            return;
        }

        $row = array();
        foreach ($value->children as $cell) {
            $cellfirst = $cell->firstChild();
            if ($cellfirst && $cellfirst->tag == "table") {
                continue;
            }
            $row[] = array("tag" => $cell->tag, "text" => trim($cell->plaintext));
        }

        // Appending (instead of indexing by a counter) keeps rows from
        // different tables or table sections from overwriting each other.
        if ($row) {
            $arr[] = $row;
        }
    }

    /**
     * Recursively collects the leaf elements below $value into $linetextarr.
     *
     * Elements with child elements are only descended into; each leaf is
     * recorded as a link, an image (after downloading it) or plain text.
     *
     * @param object $value Element node to inspect.
     */
    private function analysishtmldom($value)
    {
        if ($value->has_child()) {
            foreach ($value->children as $child) {
                $this->analysishtmldom($child);
            }
            return;
        }

        if ($value->tag == "a") {
            $tmp = array("tag" => $value->tag, "href" => $value->href, "text" => $value->innertext);
        } elseif ($value->tag == "img") {
            $tmp = $this->buildimageentry($value);
        } else {
            $tmp = array("tag" => $value->tag, "text" => strip_tags($value->innertext));
        }

        if ($tmp) {
            $this->linetextarr[] = $tmp;
        }
    }

    /**
     * Downloads the image referenced by an img element and describes it.
     *
     * @param object $value The img element node.
     *
     * @return array Entry with tag, src, text, width and height, or an empty
     *               array when the image cannot be downloaded or measured.
     */
    private function buildimageentry($value)
    {
        $rawsrc = $value->src;
        if (!is_string($rawsrc) || $rawsrc === "") {
            return array();
        }

        $src = $this->resolveimageurl($rawsrc);
        $local = $this->getimagefromnet($src, parse_url($src));
        if (!$local) {
            return array();
        }

        $size = $this->gd($local);
        if (!$size) {
            // Not a readable image after all; leave it out of the document.
            return array();
        }

        return array(
            "tag" => $value->tag,
            "src" => $local,
            "text" => $value->alt,
            "width" => $size['width'],
            "height" => $size['height'],
        );
    }

    /**
     * Turns the src attribute of an image into an absolute URL.
     *
     * @param string $rawsrc Attribute value as found in the page.
     *
     * @return string Absolute URL.
     */
    private function resolveimageurl($rawsrc)
    {
        $src = $this->unescape($rawsrc);
        $parts = parse_url($src);

        if (is_array($parts) && isset($parts['host'])) {
            if (!isset($parts['scheme'])) {
                // Protocol-relative URL ("//host/path"): reuse the page scheme.
                $src = parse_url($this->url, PHP_URL_SCHEME) . ":" . $src;
            }
            return $src;
        }

        if (substr($rawsrc, 0, 1) === "/") {
            return $this->host . $rawsrc;
        }

        // Path relative to the directory of the page.
        $pagepath = (string) parse_url($this->url, PHP_URL_PATH);
        $slash = strrpos($pagepath, "/");
        $basedir = ($slash === false) ? "/" : substr($pagepath, 0, $slash + 1);

        return $this->host . $basedir . $rawsrc;
    }

    /**
     * Reads the pixel size of an image file and halves both dimensions when
     * either one exceeds 800.
     *
     * @param string $src Path of the image file.
     *
     * @return array|false array("width" => ..., "height" => ...), or false
     *                     when the file cannot be read as an image.
     */
    private function gd($src)
    {
        $size = getimagesize($src);
        if ($size === false) {
            $this->collecterror();
            return false;
        }

        $width = $size[0];
        $height = $size[1];
        if ($width > 800 || $height > 800) {
            $width = $width / 2;
            $height = $height / 2;
        }

        return array("width" => $width, "height" => $height);
    }

    /**
     * Decodes percent-encoding, "%uXXXX" escapes and numeric HTML character
     * references ("&#xXXXX;" and "&#NNNN;") into UTF-8 text.
     *
     * @param string $str Escaped text.
     *
     * @return string Decoded text.
     */
    public function unescape($str)
    {
        $str = rawurldecode($str);
        // The trailing "." alternative captures every other byte one at a
        // time so an escape sequence can be recognised at any position.
        preg_match_all("/%u[0-9a-fA-F]{4}|&#x[0-9a-fA-F]{4};|&#\d+;|./s", $str, $r);
        $ar = $r[0];

        foreach ($ar as $k => $v) {
            if (substr($v, 0, 2) == "%u") {
                $decoded = iconv("UCS-2BE", "UTF-8", pack("H4", substr($v, -4)));
            } elseif (substr($v, 0, 3) == "&#x") {
                $decoded = iconv("UCS-2BE", "UTF-8", pack("H4", substr($v, 3, -1)));
            } elseif (substr($v, 0, 2) == "&#") {
                $decoded = iconv("UCS-2BE", "UTF-8", pack("n", substr($v, 2, -1)));
            } else {
                continue;
            }

            // Keep the original token when the code point cannot be converted.
            if ($decoded !== false) {
                $ar[$k] = $decoded;
            }
        }

        return join("", $ar);
    }

    /**
     * Downloads an image into "<current dir>/<image host>/".
     *
     * Only URLs whose path ends in a supported image extension are fetched.
     *
     * @param string      $src    Absolute URL of the image.
     * @param array|false $urlarr parse_url() result for $src.
     *
     * @return string|false Path of the stored file relative to the current
     *                      directory ("<host>/<md5 of URL path>.<ext>"), or
     *                      false when nothing was stored.
     */
    private function getimagefromnet($src, $urlarr)
    {
        if (!is_array($urlarr) || empty($urlarr['host']) || empty($urlarr['path'])) {
            return false;
        }

        $ext = strtolower(pathinfo($urlarr['path'], PATHINFO_EXTENSION));
        $_supportedimagetypes = array('jpg', 'jpeg', 'gif', 'png', 'bmp', 'tif', 'tiff');
        if (!in_array($ext, $_supportedimagetypes, true)) {
            return false;
        }

        $this->imgdir = $this->currentdir . "/" . $urlarr['host'];

        $data = file_get_contents($src);
        $this->httprequestarr[] = $src;
        $this->httprequesttime++;
        if ($data === false) {
            $this->collecterror();
            return false;
        }

        $this->_mkdir();
        $imgname = md5($urlarr['path']) . "." . $ext;
        if (file_put_contents($this->imgdir . "/" . $imgname, $data) === false) {
            $this->collecterror();
            return false;
        }

        $this->downimg++;
        return $urlarr['host'] . "/" . $imgname;
    }

    /**
     * Creates the image directory when it does not exist yet and records the
     * PHP error when creation fails.
     */
    private function _mkdir()
    {
        // The mode has to be an octal integer literal; the second is_dir()
        // check covers the directory having been created concurrently.
        if (!is_dir($this->imgdir) && !mkdir($this->imgdir, 0777, true) && !is_dir($this->imgdir)) {
            $this->collecterror();
        }
    }

    /**
     * Builds the Word document from the collected content and sends it.
     *
     * The statistics line comes first, then the entries of $linetextarr in
     * array order: the combined table (stored under the "table" key, which is
     * inserted first) followed by list items, images and paragraphs.
     */
    private function createworddom()
    {
        $phpword = new PHPWord();
        $phpword->setDefaultFontName('宋体');
        $phpword->setDefaultFontSize(11);

        $styletable = array('borderSize' => 6, 'borderColor' => '006699', 'cellMargin' => 120);
        $paragraphstyle = array('spacing' => 120);

        $section = $phpword->createSection();
        $section->addText($this->details(), array(), $paragraphstyle);

        foreach ($this->linetextarr as $key => $linearr) {
            // Strict comparison: on PHP < 8 the integer key 0 loosely equals "table".
            if ($key === "table") {
                if ($linearr) {
                    $phpword->addTableStyle('myowntablestyle', $styletable);
                    $table = $section->addTable("myowntablestyle");
                    foreach ($linearr as $tr) {
                        $table->addRow();
                        foreach ($tr as $td) {
                            $table->addCell(2000)->addText($td['text']);
                        }
                    }
                }
                continue;
            }

            if (!isset($linearr['tag'])) {
                continue;
            }

            if ($linearr['tag'] == "li") {
                $section->addListItem($linearr['text'], 0, null, null, $paragraphstyle);
            } elseif ($linearr['tag'] == "img") {
                $section->addImage(
                    $linearr['src'],
                    array('width' => $linearr['width'], 'height' => $linearr['height'], 'align' => 'center')
                );
            } elseif ($linearr['tag'] == "p") {
                $section->addText($linearr['text'], array(), $paragraphstyle);
            }
        }

        $this->downfile($phpword);
    }

    /**
     * @return string One-line summary of request count, downloaded images,
     *                elapsed time and memory use.
     */
    public function details()
    {
        $msg = "一共请求:{$this->httprequesttime}次,共下载的图片有{$this->downimg}张,并且下载完成大约使用时间:{$this->expendtime}秒,整个程序执行大约消耗内存是:{$this->expendmemory}kb,";
        return $msg;
    }

    /**
     * Saves the document to $filename and streams it to the client as an
     * attachment.
     *
     * @param object $phpword The PHPWord document to write.
     */
    public function downfile($phpword)
    {
        if (empty($this->filename)) {
            $urlarr = parse_url($this->url);
            $this->filename = $urlarr['host'] . ".docx";
        }

        $objwriter = PHPWord_IOFactory::createWriter($phpword, 'Word2007');
        $objwriter->save($this->filename);

        if (!is_readable($this->filename)) {
            $this->collecterror();
            return;
        }

        // Headers can only be set while no output has been sent yet.
        if (!headers_sent()) {
            $downloadname = str_replace(array('"', "\r", "\n"), "", basename($this->filename));

            header("Pragma: public");
            header("Expires: 0");
            header("Cache-Control: public, must-revalidate, post-check=0, pre-check=0");
            header("Content-Description: File Transfer");
            header("Content-Type: application/vnd.openxmlformats-officedocument.wordprocessingml.document");
            header("Content-Disposition: attachment; filename=\"" . $downloadname . "\"");
            header("Content-Length: " . filesize($this->filename));
        }

        if (readfile($this->filename) === false) {
            $this->collecterror();
        }
    }
}
上面的代码重点感觉不是word生成,而是simplehtmldom的使用,这是一个开源的html解析器,之前有提到,这几天在看他的代码,
引出了两个学习方向
① 正则表达式
② 这个扩展的函数整理
看源代码的收获:
php的异常是可以捕获的,而且php的错误也是可以捕获的。
error_get_last() //用这个函数可以捕获页面中的php错误,不谢。
更多关于php相关内容感兴趣的读者可查看本站专题:《php操作office文档技巧总结(包括word,excel,access,ppt)》、《php数组(array)操作技巧大全》、《php排序算法总结》、《php常用遍历算法与技巧总结》、《php数据结构与算法教程》、《php程序设计算法总结》、《php数学运算技巧总结》、《php正则表达式用法总结》、《php运算与运算符用法总结》、《php字符串(string)用法总结》及《php常见数据库操作技巧汇总》
希望本文所述对大家php程序设计有所帮助。