使用浏览器库、临时运行的浏览器或独立实用程序作为桥接/代理是一种可接受的解决方案(但是,所选的浏览器或浏览器库必须在所有主要平台上都可用,并且必须能够在没有运行或安装操作系统GUI的情况下工作).
一个可选的要求是之后删除所有脚本(已经有了独立的解决方案,在这里添加它,因为可能给定的答案将能够在渲染时删除脚本或类似的东西).
如何把页面当前的HTML+CSS状态保存为单个.html文件的快照,其中包含当前样式(可能是内联的)和当前图像(使用data URI)?
如果它可以使用纯PHP来完成它将是一个加号(虽然我怀疑它,我没有发现任何有趣的东西).
编辑:我知道如何加载HTTP资源并获取URL的HTML,这不是我正在寻找的;)
编辑2
示例输入HTML:
- <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
- <html>
- <head>
- <title></title>
- <meta http-equiv="Content-Type" content="text/html;charset=utf-8">
- <link rel="stylesheet" type="text/css" href="/css/example.css">
- <script type="text/javascript" src="/javascript/example.js"></script>
- <script type="text/javascript">
- window.addEventListener("load",function(event){
- document.title="New title";
- document.getElementById("pic_0").style.border="0px";
- }
- );
- </script>
- <style type="text/css">
- p{
- color: blue;
- }
- </style>
- </head>
- <body>
- <p>Hello world!</p>
- <p>
- <img
- alt=""
- style="border: 1px"
- id="pic_0"
- src="http://linuxgazette.net/144/misc/john/helloworld.png"
- >
- </p>
- </body>
- </html>
输出示例:
- <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
- <html>
- <head>
- <title>New title</title>
- <meta http-equiv="Content-Type" content="text/html;charset=utf-8">
- <style type="text/css">
- b{font-weight: bold}
- </style>
- <style type="text/css">
- p{
- color: blue;
- }
- </style>
- </head>
- <body>
- <p>Hello world!</p>
- <p>
- <img
- alt=""
- style="border: 0px"
- id="pic_0"
- src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACgAAAAoBAMAAAB+0KVeAAAAK3RFWHRDcmVhdGlvbiBUaW1lAFYgMzEgYXVnLiAyMDEyIDE3OjU4OjU1ICswMjAwWMdbPwAAAAd0SU1FB9wIHw8ABeoUyU4AAAAJcEhZcwAACxIAAAsSAdLdfvwAAAAEZ0FNQQAAsY8L/GEFAAAABlBMVEX///8AAABVwtN+AAAAXklEQVR42uWQUQ6AMAhD6Q3a+19WqsawwMf+NLEfy3iDlC7idTGQp/YglFAsUMqSwjlQOhN3mIMTHDq70SeEWBbt0EG8POWkDySvmCh/SssvNfwIfb+hFmgjFKPf6gDQBAQ368m09AAAAABJRU5ErkJggg=="
- >
- </p>
- </body>
- </html>
请注意<title>标签的内容已更改,border: 1px变成了border: 0px,并且图像URL被转换成了data URI.
例如,在使用Google Chrome检查器检查文档时,可以观察到其中一些转换(内联CSS和< title>标记).
编辑3:把外部资源(样式和图像)替换为内联内容并删除javascript是简单的部分.困难的部分是在运行javascript后计算CSS样式.
编辑4也许这可以使用注入的javascript(仍然需要浏览器控制)来完成?
PhantomJS在所有主要平台上运行,正如我在我的问题中所要求的那样.
它可以运行Javascript脚本来控制无GUI的Web浏览器.它有一个强大的API,还有很多很多例子.
在过去2-3天的业余时间里,我为我的问题编写了解决方案,它完美地涵盖了所有要求.我还没有找到一个不起作用的网页.
.
用法,命令行:
- phantomjs save_as_html.js https://stackoverflow.com/q/12215844/584490 saved.html
.
允许Javascript在其他所有内容加载后运行n秒,它甚至可以用于完全由javascript生成的网页.
.
笔记:
>在可能的情况下,优先使用XHR加载资源,而不是HTML5的canvas渲染,因为canvas渲染会增大文件体积并造成质量损失(重用原始文件比任何方式都好).
><link>和<img>标签保持在原位,其href和src属性中分别使用data: URI代替URL.background-image也是如此,它是通过对所有DOM节点调用getComputedStyle()读取的.
>< script>标记和事件处理程序属性已删除.
>rel="alternate"的<link>标签也被删除(也许不应该删除它们,而是在其URL为相对路径时改写为绝对URL).
>< iframe>目前尚未处理,其src属性设置为about:blank.
.
请注意,该脚本解除了所有跨站点脚本安全限制,以便可以加载所有资源.请确保不要在带有敏感凭据(例如Facebook帐户)的情况下尝试保存恶意网页 :).
.
save_as_html.js内容:
// Source: https://stackoverflow.com/a/12256190/584490
//
// PhantomJS script that loads a URL, lets the page's own scripts run for a
// short while, then rewrites the live DOM so that images, <link> targets and
// computed background images are embedded as data: URIs, scripts and inline
// event handlers are removed, and the result is written to a single HTML file.
//
// Usage: phantomjs save_as_html.js URL filename
var page = require('webpage').create();
var system = require('system');
var fs = require('fs');

// Time (ms) the page is left alone after load so its scripts can finish
// modifying the DOM before the snapshot is taken.
var SETTLE_DELAY_MS = 1000;

// Forward console output produced inside the page to PhantomJS' stdout.
page.onConsoleMessage = function (msg) { console.log(msg); };

/**
 * Rewrites the current document in place so it no longer depends on external
 * resources or scripts.
 *
 * NOTE: this function is handed to page.evaluate() and therefore runs inside
 * the web page, not in the PhantomJS script context. It must stay
 * self-contained: it may only use its own locals and page globals
 * (document, window, XMLHttpRequest, console).
 */
function inlinePageResources() {
    var EVENT_HANDLER_ATTRIBUTES = [
        "onblur", "onchange", "onclick", "ondblclick", "onfocus", "onkeydown",
        "onkeyup", "onkeypress", "onload", "onmousedown", "onmousemove",
        "onmouseout", "onmouseover", "onmouseup", "onreset", "onselect",
        "onsubmit", "onunload"
    ];
    var BASE64_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

    /**
     * Base64-encodes a "binary string": only the low byte of every character
     * code is used, which is what an XHR response decoded with
     * charset=x-user-defined delivers.
     * Based on https://stackoverflow.com/a/7372816/584490
     *
     * @param {string} str binary string to encode
     * @returns {string} base64 text, padded with "=" to a multiple of 4
     */
    function base64Encode(str) {
        var out = "";
        var len = str.length;
        var i = 0;
        while (i < len) {
            var b1 = str.charCodeAt(i++) & 0xff;
            if (i === len) {
                // One trailing byte: two symbols plus two padding characters.
                out += BASE64_CHARS.charAt(b1 >> 2);
                out += BASE64_CHARS.charAt((b1 & 0x3) << 4);
                out += "==";
                break;
            }
            var b2 = str.charCodeAt(i++) & 0xff;
            if (i === len) {
                // Two trailing bytes: three symbols plus one padding character.
                out += BASE64_CHARS.charAt(b1 >> 2);
                out += BASE64_CHARS.charAt(((b1 & 0x3) << 4) | ((b2 & 0xF0) >> 4));
                out += BASE64_CHARS.charAt((b2 & 0xF) << 2);
                out += "=";
                break;
            }
            var b3 = str.charCodeAt(i++) & 0xff;
            out += BASE64_CHARS.charAt(b1 >> 2);
            out += BASE64_CHARS.charAt(((b1 & 0x3) << 4) | ((b2 & 0xF0) >> 4));
            out += BASE64_CHARS.charAt(((b2 & 0xF) << 2) | ((b3 & 0xC0) >> 6));
            out += BASE64_CHARS.charAt(b3 & 0x3F);
        }
        return out;
    }

    /**
     * Synchronously downloads a resource and returns it as a data: URI.
     * The original bytes are reused as-is (no canvas re-encoding), so there is
     * no quality loss and no size increase beyond the base64 overhead.
     *
     * @param {string} url resource to download
     * @param {string} [mimeType] MIME type to put in the URI; when empty the
     *        response's Content-Type header is used instead
     * @returns {string} "data:<type>;base64,<payload>"
     * @throws {Error} when the server answers with a non-2xx status
     */
    function fetchAsDataUri(url, mimeType) {
        var xhr = new XMLHttpRequest();
        xhr.open("get", url, /*Asynchronous*/ false);
        // Keep the body as raw bytes instead of letting it be decoded as text.
        xhr.overrideMimeType("text/plain; charset=x-user-defined");
        xhr.send(null);
        // Status 0 is what non-HTTP schemes (e.g. file:) report on success.
        if (xhr.status !== 0 && (xhr.status < 200 || xhr.status >= 300)) {
            throw new Error("HTTP " + xhr.status + " while loading " + url);
        }
        var type = mimeType;
        if (!type) {
            var header = xhr.getResponseHeader("Content-type") || "";
            // Keep "+" and "." so types such as image/svg+xml survive.
            type = header.split(";")[0].replace(/[^a-z0-9\/+.-]/gi, "");
        }
        return "data:" + type + ";base64," + base64Encode(xhr.responseText);
    }

    /** Returns true when the given URL is non-empty and not already a data: URI. */
    function needsInlining(url) {
        return !!url && url.substr(0, 5) !== "data:";
    }

    /** Applies every rewrite rule to a single element. */
    function processElement(el) {
        var tag = el.nodeName.toUpperCase();

        if (tag === "SCRIPT") {
            el.parentNode.removeChild(el);
            return;
        }

        // Strip inline handlers first so they are gone even if a later
        // download for this element fails.
        for (var z = EVENT_HANDLER_ATTRIBUTES.length - 1; z >= 0; z--) {
            el.removeAttribute(EVENT_HANDLER_ATTRIBUTES[z]);
        }

        if (tag === "IFRAME") {
            // Frames are not captured; blank them out.
            el.src = "about:blank";
            return;
        }

        if (tag === "LINK") {
            if (el.rel === "alternate") {
                el.parentNode.removeChild(el);
            } else if (needsInlining(el.href)) {
                // Prefer the declared type attribute; fall back to the
                // response Content-Type when the attribute is missing.
                el.href = fetchAsDataUri(el.href, el.type);
            }
            return;
        }

        if (tag === "IMG" && needsInlining(el.src)) {
            el.src = fetchAsDataUri(el.src);
        }

        // Inline a single computed background image. The URL may or may not
        // be quoted depending on the engine; values with several layers or
        // gradients do not match and are left untouched.
        var background = window.getComputedStyle(el).getPropertyValue("background-image") || "";
        var match = /^url\(\s*["']?([^"')]+)["']?\s*\)$/.exec(background.replace(/^\s+|\s+$/g, ""));
        if (match && needsInlining(match[1])) {
            el.style.backgroundImage = "url(" + fetchAsDataUri(match[1]) + ")";
        }

        if (tag === "A") {
            el.href = "#"; // TODO convert relative paths to absolute ones (keep URLs);
            el.setAttribute("onclick", "return false;"); // TODO: remove this when the above is fixed.
        } else if (tag === "FORM") {
            el.setAttribute("action", "");
            el.setAttribute("onsubmit", "return false;");
        }
    }

    // Walk backwards: the collection is live and elements get removed.
    var nodeList = document.getElementsByTagName("*");
    for (var n = nodeList.length - 1; n >= 0; n--) {
        var el = nodeList[n];
        try {
            processElement(el);
        } catch (error) {
            // Best effort: one failing resource must not abort the snapshot,
            // but report it instead of hiding it.
            console.log("save_as_html: skipped <" + el.nodeName + ">: " + error);
        }
    }
}

if (system.args.length !== 3) {
    console.log('Usage: save_as_html.js URL filename');
    phantom.exit(1);
} else {
    var address = system.args[1];
    var output = system.args[2];

    page.viewportSize = { width: 1680, height: 1050 };

    // SECURITY_ERR: DOM Exception 18: An attempt was made to break through the security policy of the user agent.
    // Enable cross site scripting so resources from any origin can be
    // downloaded by the XHRs issued inside the page:
    page.settings.XSSAuditingEnabled = false;
    page.settings.localToRemoteUrlAccessEnabled = true;
    page.settings.webSecurityEnabled = false;
    page.settings.userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML,like Gecko) Chrome/22.0.1207.1 Safari/537.1";
    page.settings.ignoreSslErrors = true;

    page.open(address, function (status) {
        if (status !== 'success') {
            console.log("Unable to load URL,returned status: " + status);
            phantom.exit(1);
            return;
        }

        window.setTimeout(function () {
            try {
                page.evaluate(inlinePageResources);
                fs.write(output, page.content, "w");
            } catch (error) {
                // Without this, a failed write would leave PhantomJS running forever.
                console.log("Unable to save snapshot: " + error);
                phantom.exit(1);
                return;
            }
            phantom.exit();
        }, SETTLE_DELAY_MS);
    });
}