虽然不知道我会如何实现这一点.
有没有什么办法可以做到这一点?
此外,我也愿意接受更简单的方法,因为我对其他代理如何解析网站感到非常困惑.我认为,它们大多只是美化版的网站抓取器:读取网站的源码,并把其中所有的链接/表单重新指向代理.
解决方法
给OP的提示(因为他要求的正是这样一个功能):将base_url更改为代理的基础URL(base URL),即可达到预期的结果.
下面将显示两个功能(使用指南包含在代码中).确保您不要跳过该答案的任何部分解释,以充分了解功能的行为.
> rel_to_abs(url) – 此函数返回绝对URL.当传递具有普遍信任协议的绝对URL时,它将立即返回此URL.否则,将从base_url和函数参数生成绝对URL.相对URL正确解析(../; ./;.; //).
> replace_all_rel_by_abs – 此功能将解析HTML中具有重要含义的所有URL,例如CSS url(),链接和外部资源.有关已解析实例的完整列表,请参阅代码.有关调整的实现,请参阅this answer,以从外部源清理HTML字符串(嵌入到文档中).
> 测试用例(在答案的底部):要测试函数的有效性,只需将bookmarklet粘贴到浏览器的地址栏中.
rel_to_abs – 解析相对URL
/*
 @name rel_to_abs
 @description  Converts a relative URL into an absolute URL, using the URL of
               the current document (location) as the base.
 @param String url  URL to convert
 @returns String    The absolute URL; "" when url is empty or whitespace-only.
                    URLs that already start with an accepted protocol are
                    returned unchanged.
 */
function rel_to_abs(url){
    /* Only accept commonly trusted protocols:
     * Only data-image URLs are accepted. Exotic flavours (escaped slash,
     * html-entitied characters) are not supported to keep the function fast.
     * NOTE(review): "javascript:" URLs are passed through unchanged. */
    if(/^(https?|file|ftps?|mailto|javascript|data:image\/[^;]{2,9};):/i.test(url))
        return url; /* URL is already absolute */

    if(url.substring(0,2) == "//")
        return location.protocol + url; /* Protocol-relative */
    if(url.charAt(0) == "/")
        return location.protocol + "//" + location.host + url; /* Root-relative */
    if(/^\s*$/.test(url))
        return ""; /* Empty = return nothing */

    /* Document URL without fragment, and without query string + fragment.
     * Change base_url to resolve against a different base document. */
    var doc_url = location.href.replace(/#.*$/,"");
    var base_url = doc_url.replace(/\?.*$/,"");

    /* Split off the query string / fragment of the relative URL, so that
     * "../" or "/." inside them are never treated as path segments. */
    var tail = "";
    var tailStart = url.search(/[?#]/);
    if(tailStart != -1){
        tail = url.substring(tailStart);
        url = url.substring(0,tailStart);
    }

    if(url == ""){
        /* "?query" replaces the document's query; "#fragment" keeps it */
        url = (tail.charAt(0) == "#" ? doc_url : base_url) + tail;
    } else {
        var base = /^([a-z][a-z0-9+.\-]*:\/\/[^\/]*)(\/.*)?$/i.exec(base_url);
        var origin = base ? base[1] : location.protocol + "//" + location.host;
        var path = (base && base[2] ? base[2] : "/").split("/");
        path.pop(); /* Drop the document's file name, keep its directory */

        var segments = url.split("/");
        for(var i=0; i<segments.length; i++){
            var segment = segments[i];
            var isLast = i == segments.length - 1;
            if(segment == ".."){
                /* path[0] is the empty string before the leading slash:
                 * never pop it, so "../" cannot climb above the root */
                if(path.length > 1) path.pop();
                if(isLast) path.push(""); /* Keep the trailing slash */
            } else if(segment == "."){
                if(isLast) path.push("");
            } else {
                path.push(segment);
            }
        }
        url = origin + path.join("/") + tail;
    }

    /* Escape certain characters to prevent XSS */
    return url.replace(/"/g,"%22").replace(/'/g,"%27")
              .replace(/</g,"%3C").replace(/>/g,"%3E");
}
案例/示例:
> http://foo.bar.已经是一个绝对的URL,因此立即返回.
> /doo相对于根:返回当前根 + 所提供的相对URL.
> ./meh相对于当前目录.
> ../booh相对于父目录.
该函数将相对路径转换为../,并执行搜索替换(http://domain/sub/anything-but-a-slash/../me到http:// domain / sub / me) .
replace_all_rel_by_abs – 转换所有相关的URL
脚本实例中的URL(< script>,事件处理程序)不被替换,因为创建快速安全的过滤器来解析JavaScript几乎是不可能的.
这个脚本里有一些注释.正则表达式是动态创建的,因为单个RE的长度可达3000个字符. <meta http-equiv=refresh content=..>可以通过各种方式被混淆,所以RE才会这么长.
/*
 @name replace_all_rel_by_abs
 @description  Rewrites URLs found in a HTML string by passing them through
               rel_to_abs(): href, src, <object data>, <applet codebase>,
               <param name=movie value>, <meta http-equiv=refresh content>,
               and CSS url() inside <style> blocks and style attributes.
 @param String html  HTML source to process
 @returns String     The HTML source with the matched URLs replaced
 */
function replace_all_rel_by_abs(html){
    /* HTML/XML Attribute may not be prefixed by these characters (common
       attribute chars). This list is not complete, but will be sufficient
       for this function (see http://www.w3.org/TR/REC-xml/#NT-NameChar). */
    var att = "[^-a-z0-9:._]";

    /* A numeric character reference ends at ";" or before a non-digit */
    var entityEnd = "(?:;|(?!\\d))";
    /* Placeholders to filter obfuscations: each character may be written
       literally, as a decimal reference (&#NN) or as a hex reference (&#xHH),
       both with optional leading zeros. */
    var ents = {
        " ":"(?:\\s|&nbsp;?|&#0*32"+entityEnd+"|&#x0*20"+entityEnd+")",
        "(":"(?:\\(|&#0*40"+entityEnd+"|&#x0*28"+entityEnd+")",
        ")":"(?:\\)|&#0*41"+entityEnd+"|&#x0*29"+entityEnd+")",
        ".":"(?:\\.|&#0*46"+entityEnd+"|&#x0*2e"+entityEnd+")"
    };
    var charMap = {}; /* Per-character cache used by ae() */
    var s = ents[" "]+"*"; /* Short-hand for common use */
    var any = "(?:[^>\"']*(?:\"[^\"]*\"|'[^']*'))*?[^>]*";
    /* ^ Important: Must be pre- and postfixed by < and >.
     *   This RE should match anything within a tag! */

    /*
     @name ae
     @description  Converts a given string in a sequence of the original
                   input and the HTML entity
     @param String string  String to convert
     @returns String       RegExp source; the result is cached in ents
     */
    function ae(string){
        var all_chars_lowercase = string.toLowerCase();
        if(ents[string]) return ents[string];
        var all_chars_uppercase = string.toUpperCase();
        var RE_res = "";
        for(var i=0; i<string.length; i++){
            var char_lowercase = all_chars_lowercase.charAt(i);
            if(charMap[char_lowercase]){
                RE_res += charMap[char_lowercase];
                continue;
            }
            var char_uppercase = all_chars_uppercase.charAt(i);
            var RE_sub = [char_lowercase];
            RE_sub.push("&#0*" + char_lowercase.charCodeAt(0) + entityEnd);
            RE_sub.push("&#x0*" + char_lowercase.charCodeAt(0).toString(16) + entityEnd);
            if(char_lowercase != char_uppercase){
                /* Note: RE ignorecase flag has already been activated */
                RE_sub.push("&#0*" + char_uppercase.charCodeAt(0) + entityEnd);
                RE_sub.push("&#x0*" + char_uppercase.charCodeAt(0).toString(16) + entityEnd);
            }
            RE_sub = "(?:" + RE_sub.join("|") + ")";
            RE_res += (charMap[char_lowercase] = RE_sub);
        }
        return(ents[string] = RE_res);
    }

    /*
     @name by
     @description  2nd argument for replace().
     */
    function by(match,group1,group2,group3){
        /* Note that this function can also be used to remove links:
         * return group1 + "javascript://" + group3; */
        return group1 + rel_to_abs(group2) + group3;
    }

    /*
     @name by2
     @description  2nd argument for replace(). Parses relevant HTML entities
     */
    var slashRE = new RegExp(ae("/"),'g');
    var dotRE = new RegExp(ae("."),'g');
    function by2(match,group1,group2,group3){
        /* Note that this function can also be used to remove links:
         * return group1 + "javascript://" + group3; */
        /* Decode entity-encoded "/" and "." before resolving the URL */
        group2 = group2.replace(slashRE,"/").replace(dotRE,".");
        return group1 + rel_to_abs(group2) + group3;
    }

    /*
     @name cr
     @description  Selects a HTML element and performs a search-and-replace
                   on attributes
     @param String selector   HTML substring to match
     @param String attribute  RegExp-escaped; HTML element attribute to match
     @param String marker     Optional RegExp-escaped; marks the prefix
     @param String delimiter  Optional RegExp escaped; non-quote delimiters
     @param String end        Optional RegExp-escaped; forces the match to end
                              before an occurence of <end>
     */
    function cr(selector,attribute,marker,delimiter,end){
        if(typeof selector == "string") selector = new RegExp(selector,"gi");
        attribute = att + attribute;
        marker = typeof marker == "string" ? marker : "\\s*=\\s*";
        delimiter = typeof delimiter == "string" ? delimiter : "";
        /* Closes the value group (lazily when <end> is given) and opens the
           third group, so that by() always receives three groups */
        end = typeof end == "string" ? "?)("+end : ")(";
        var re1 = new RegExp('('+attribute+marker+'")([^"'+delimiter+']+'+end+')','gi');
        var re2 = new RegExp("("+attribute+marker+"')([^'"+delimiter+"]+"+end+")",'gi');
        var re3 = new RegExp('('+attribute+marker+')([^"\'][^\\s>'+delimiter+']*'+end+')','gi');
        html = html.replace(selector,function(match){
            return match.replace(re1,by).replace(re2,by).replace(re3,by);
        });
    }

    /*
     @name cri
     @description  Selects an attribute of a HTML element, and performs a
                   search-and-replace on certain values
     @param String selector   HTML element to match
     @param String attribute  RegExp-escaped; HTML element attribute to match
     @param String front      RegExp-escaped; attribute value, prefix to match
     @param String flags      Optional RegExp flags, default "gi"
     @param String delimiter  Optional RegExp-escaped; non-quote delimiters
     @param String end        Optional RegExp-escaped; forces the match to end
                              before an occurence of <end>
     */
    function cri(selector,attribute,front,flags,delimiter,end){
        if(typeof selector == "string") selector = new RegExp(selector,"gi");
        attribute = att + attribute;
        flags = typeof flags == "string" ? flags : "gi";
        var re1 = new RegExp('('+attribute+'\\s*=\\s*")([^"]*)','gi');
        var re2 = new RegExp("("+attribute+"\\s*=\\s*')([^']+)",'gi');
        var at1 = new RegExp('('+front+')([^"]+)(")',flags);
        var at2 = new RegExp("("+front+")([^']+)(')",flags);
        var handleAttr;
        if(typeof delimiter == "string"){
            end = typeof end == "string" ? end : "";
            var at3 = new RegExp("("+front+")([^\"'][^"+delimiter+"]*" + (end?"?)("+end+")":")()"),flags);
            handleAttr = function(match,g1,g2){return g1+g2.replace(at1,by2).replace(at2,by2).replace(at3,by2);};
        } else {
            handleAttr = function(match,g1,g2){return g1+g2.replace(at1,by2).replace(at2,by2);};
        }
        html = html.replace(selector,function(match){
            return match.replace(re1,handleAttr).replace(re2,handleAttr);
        });
    }

    /* <meta http-equiv=refresh content="  ; url= " > */
    cri("<Meta"+any+att+"http-equiv\\s*=\\s*(?:\""+ae("refresh")+"\""+any+">|'"+ae("refresh")+"'"+any+">|"+ae("refresh")+"(?:"+ae(" ")+any+">|>))","content",ae("url")+s+ae("=")+s,"i");

    cr("<"+any+att+"href\\s*="+any+">","href"); /* Linked elements */
    cr("<"+any+att+"src\\s*="+any+">","src"); /* Embedded elements */

    cr("<object"+any+att+"data\\s*="+any+">","data"); /* <object data= > */
    cr("<applet"+any+att+"codebase\\s*="+any+">","codebase"); /* <applet codebase= > */

    /* <param name=movie value= > */
    cr("<param"+any+att+"name\\s*=\\s*(?:\""+ae("movie")+"\""+any+">|'"+ae("movie")+"'"+any+">|"+ae("movie")+"(?:"+ae(" ")+any+">|>))","value");

    cr(/<style[^>]*>(?:[^"']*(?:"[^"]*"|'[^']*'))*?[^'"]*(?:<\/style|$)/gi,"url","\\s*\\(\\s*","","\\s*\\)"); /* <style> */
    /* The non-string 4th argument selects the default flags ("gi"); passing
       the delimiter pattern there would be an invalid RegExp flags string */
    cri("<"+any+att+"style\\s*="+any+">","style",ae("url")+s+ae("(")+s,null,s+ae(")"),ae(")")); /* < style=" url(...) " > */
    return html;
}
> rel_to_abs(url) – 将相对/未知的URL转换为绝对URL
> replace_all_rel_by_abs(html) – 用绝对URL替换HTML字符串中所有URL的相关发生.
> ae – 任何实体 – 返回一个RE模式来处理HTML实体.
> by – replace by – 这个简短的函数负责调用实际的URL替换(rel_to_abs).它可能被调用数百次,甚至上千次.如需定制,注意不要在此函数中加入耗时的算法.
> cr – 创建替换 – 创建并执行search-and-replace.Example:href =“…”(在任何HTML标签内).
> cri – 创建替换内联 – 在HTML标签内的所有样式属性内创建并执行search-and-replace.Example:url(..).
测试用例
打开任何页面,并将以下bookmarklet粘贴到位置栏中:
javascript:void(function(){var script=document.createElement("script");script.src="http://rob.lekensteyn.nl/rel_to_abs.js";document.body.appendChild(script)}());
注入的代码包含如上定义的两个函数,加上测试用例,如下所示.注意:测试用例不会修改页面的HTML,但在textarea(可选)中显示已解析的结果.
/* Test case: times replace_all_rel_by_abs() on the current page's HTML and,
 * on confirmation, shows the parsed result in a full-page textarea.
 * The page's own HTML is not modified. */
var t = (new Date).getTime();
var result = replace_all_rel_by_abs(document.documentElement.innerHTML);
if(confirm((new Date).getTime()-t+" milliseconds to execute\n\nPut results in new textarea?")){
    var txt = document.createElement("textarea");
    txt.style.cssText = "position:fixed;top:0;left:0;width:100%;height:99%";
    /* Double-click removes the textarea again */
    txt.ondblclick = function(){this.parentNode.removeChild(this);};
    txt.value = result;
    document.body.appendChild(txt);
}
也可以看看: