JavaScript:用正则表达式将所有相对 URL 转换为绝对 URL

前端之家收集整理的这篇文章主要介绍了Javascript:REGEX将所有相对Urls更改为Absolute前端之家小编觉得挺不错的,现在分享给大家,也给大家做个参考。
我目前正在创建一个Node.js的webscraper / proxy,但是我无法解析源码脚本部分中发现的相对Urls,我认为REGEX会做的.
虽然不知道我会如何实现这一点.

有没有什么办法可以做到这一点?

此外,我也乐于接受更简单的方法,因为我对其他代理如何解析网站非常困惑.我猜大多数代理只是美化过的网站抓取器:读取网站的源码,再把其中所有链接/表单改写为指向代理自身.

解决方法

高级 HTML字符串替换功能

给提问者的提示(因为他要求的正是这样一个函数):将base_url改为你的代理的基础URL(base URL),即可达到预期的结果.

下面将显示两个功能(使用指南包含在代码中).确保您不要跳过该答案的任何部分解释,以充分了解功能的行为.

> rel_to_abs(url) – 此函数返回绝对URL.当传递具有普遍信任协议的绝对URL时,它将立即返回此URL.否则,将从base_url和函数参数生成绝对URL.相对URL正确解析(../; ./;.; //).
> replace_all_rel_by_abs – 此功能将解析HTML中具有重要含义的所有URL,例如CSS url(),链接和外部资源.有关已解析实例的完整列表,请参阅代码.有关调整的实现,请参阅this answer,以从外部源清理HTML字符串(嵌入到文档中).
> 测试用例(在答案的底部):要测试函数的有效性,只需将bookmarklet(书签小程序)粘贴到浏览器的地址栏.

rel_to_abs – 解析相对URL

/**
 * Converts a relative (or unknown) URL into an absolute URL, resolved
 * against the current document (`location`).
 *
 * - URLs that already start with a commonly trusted scheme (http, https,
 *   file, ftp, ftps, mailto, javascript) or that are `data:image/...;` URIs
 *   are returned unchanged.
 * - `//host/path` gets the current protocol; `/path` gets the current
 *   protocol and host.
 * - An empty / whitespace-only string yields "".
 * - Anything else is resolved against the directory of the current page,
 *   with `.` and `..` segments collapsed. A query-only (`?x`) or
 *   fragment-only (`#x`) reference keeps the current page's path.
 *
 * Exotic flavours (escaped slashes, HTML-entity encoded characters) are not
 * supported, to keep the function fast.
 *
 * @param {string} url  URL as found in the document
 * @returns {string}    Absolute URL; `"`, `'`, `<` and `>` are percent-encoded
 *                      in resolved relative URLs so the result can be placed
 *                      back into an HTML attribute.
 */
function rel_to_abs(url){
    /* Only accept commonly trusted protocols. The data:image alternative is
     * kept outside the "scheme:" group: its own colon is part of the pattern,
     * and it is followed by parameters/data, never by another colon. */
    if(/^(?:(?:https?|file|ftps?|mailto|javascript):|data:image\/[^;]{2,9};)/i.test(url))
        return url; //Url is already absolute

    if(url.substring(0,2) == "//")
        return location.protocol + url;
    if(url.charAt(0) == "/")
        return location.protocol + "//" + location.host + url;
    if(/^\s*$/.test(url))
        return ""; //Empty = Return nothing

    /* Split the current document URL into origin, path and query. The
     * fragment and query are removed first, so slashes inside them cannot be
     * mistaken for path separators. */
    var base = location.href.replace(/#[\s\S]*$/,"");
    var base_query = "";
    var query_start = base.indexOf("?");
    if(query_start != -1){
        base_query = base.substring(query_start);
        base = base.substring(0,query_start);
    }
    var origin = (/^[^\/]*\/\/[^\/]*/.exec(base) || [""])[0];
    var base_path = base.substring(origin.length) || "/";

    /* Only the path part of the relative URL takes part in dot-segment
     * removal; its query/fragment is appended untouched. */
    var suffix_start = url.search(/[?#]/);
    var rel_path = suffix_start == -1 ? url : url.substring(0,suffix_start);
    var suffix = suffix_start == -1 ? "" : url.substring(suffix_start);

    var path;
    if(rel_path == ""){
        /* "?query" or "#fragment" only: same document path. A bare fragment
         * also keeps the current query string. */
        path = base_path;
        if(suffix.charAt(0) == "#") suffix = base_query + suffix;
    } else {
        var base_dir = base_path.substring(0,base_path.lastIndexOf("/")+1);
        var segments = (base_dir + rel_path).split("/");
        var resolved = [];
        for(var i=0; i<segments.length; i++){
            var segment = segments[i];
            var is_last = i == segments.length-1;
            if(segment == ".."){
                /* Never climb above the root (the leading empty segment) */
                if(resolved.length > 1) resolved.pop();
                if(is_last) resolved.push(""); //Keep the trailing slash
            } else if(segment == "."){
                if(is_last) resolved.push(""); //Keep the trailing slash
            } else {
                resolved.push(segment);
            }
        }
        path = resolved.join("/");
    }

    /* Escape certain characters to prevent XSS */
    return (origin + path + suffix).replace(/"/g,"%22").replace(/'/g,"%27")
            .replace(/</g,"%3C").replace(/>/g,"%3E");
}

案例/示例:

> http://foo.bar 已经是一个绝对URL,因此立即返回.
> /doo 相对于根:返回以当前站点根为基准的绝对URL.
> ./meh相对于当前目录.
> ../booh相对于父目录.

函数将相对路径统一转换为以../开头的形式,并执行搜索替换(例如把http://domain/sub/anything-but-a-slash/../me替换为http://domain/sub/me).

replace_all_rel_by_abs – 转换所有相关的URL
脚本实例中的URL(< script>,事件处理程序)不被替换,因为创建快速安全的过滤器来解析JavaScript几乎是不可能的.

这个脚本里面有一些注释.正则表达式是动态创建的,因为单个RE的长度可能达到3000个字符. <meta http-equiv=refresh content=..>可以用各种方式混淆,所以RE才会这么长.

/**
 * Rewrites every relative URL with a significant meaning in an HTML string
 * to an absolute URL by passing it through rel_to_abs().
 *
 * Handled places: href=, src=, <object data=>, <applet codebase=>,
 * <param name=movie value=>, url(...) inside <style> blocks and inside
 * style= attributes, and the url= part of <meta http-equiv=refresh content=>.
 * URLs inside scripts (<script>, event handlers) are not touched.
 *
 * The regular expressions are assembled at run time because tolerating
 * HTML-entity obfuscation makes the individual patterns very long.
 *
 * @param {string} html  HTML source to process
 * @returns {string}     The HTML with the matched URLs replaced
 */
function replace_all_rel_by_abs(html){
    /*HTML/XML Attribute may not be prefixed by these characters (common 
       attribute chars.  This list is not complete,but will be sufficient
       for this function (see http://www.w3.org/TR/REC-xml/#NT-NameChar). */
    var att = "[^-a-z0-9:._]";

    /* A numeric entity ends at ";" or, when the ";" is omitted, at the first
     * non-digit */
    var entityEnd = "(?:;|(?!\\d))";
    var ents = {
        " ":"(?:\\s|&nbsp;?|&#0*32"+entityEnd+"|&#x0*20"+entityEnd+")",
        "(":"(?:\\(|&#0*40"+entityEnd+"|&#x0*28"+entityEnd+")",
        ")":"(?:\\)|&#0*41"+entityEnd+"|&#x0*29"+entityEnd+")",
        ".":"(?:\\.|&#0*46"+entityEnd+"|&#x0*2e"+entityEnd+")"
    };
                /* Placeholders to filter obfuscations */
    var charMap = {}; //Per-character cache of the patterns built by ae()
    var s = ents[" "]+"*"; //Short-hand for common use
    var any = "(?:[^>\"']*(?:\"[^\"]*\"|'[^']*'))*?[^>]*";
    /* ^ Important: Must be pre- and postfixed by < and >.
     *   This RE should match anything within a tag!  */

    /*
      @name ae
      @description  Converts a given string in a sequence of the original
                      input and the HTML entity
      @param String string  String to convert
      @returns String       RegExp source matching the string, where each
                            character may also be written as a decimal or
                            hexadecimal entity. Results are cached in ents.
      */
    function ae(string){
        var all_chars_lowercase = string.toLowerCase();
        if(ents[string]) return ents[string];
        var all_chars_uppercase = string.toUpperCase();
        var RE_res = "";
        for(var i=0; i<string.length; i++){
            var char_lowercase = all_chars_lowercase.charAt(i);
            if(charMap[char_lowercase]){
                RE_res += charMap[char_lowercase];
                continue;
            }
            var char_uppercase = all_chars_uppercase.charAt(i);
            var RE_sub = [char_lowercase];
            RE_sub.push("&#0*" + char_lowercase.charCodeAt(0) + entityEnd);
            RE_sub.push("&#x0*" + char_lowercase.charCodeAt(0).toString(16) + entityEnd);
            if(char_lowercase != char_uppercase){
                /* Note: RE ignorecase flag has already been activated */
                RE_sub.push("&#0*" + char_uppercase.charCodeAt(0) + entityEnd);   
                RE_sub.push("&#x0*" + char_uppercase.charCodeAt(0).toString(16) + entityEnd);
            }
            RE_sub = "(?:" + RE_sub.join("|") + ")";
            RE_res += (charMap[char_lowercase] = RE_sub);
        }
        return(ents[string] = RE_res);
    }

    /*
      @name by
      @description  2nd argument for replace().
                    group1 = text before the URL, group2 = the URL,
                    group3 = text after the URL.
      */
    function by(match,group1,group2,group3){
        /* Note that this function can also be used to remove links:
         * return group1 + "javascript://" + group3; */
        return group1 + rel_to_abs(group2) + group3;
    }
    /*
      @name by2
      @description  2nd argument for replace(). Parses relevant HTML entities
                    (entity-encoded "/" and "." are decoded before the URL is
                    resolved). Same group layout as by().
      */
    var slashRE = new RegExp(ae("/"),'g');
    var dotRE = new RegExp(ae("."),'g');
    function by2(match,group1,group2,group3){
        /*Note that this function can also be used to remove links:
         * return group1 + "javascript://" + group3; */
        group2 = group2.replace(slashRE,"/").replace(dotRE,".");
        return group1 + rel_to_abs(group2) + group3;
    }
    /*
      @name cr
      @description            Selects a HTML element and performs a
                                search-and-replace on attributes
      @param String selector  HTML substring to match
      @param String attribute RegExp-escaped; HTML element attribute to match
      @param String marker    Optional RegExp-escaped; marks the prefix
      @param String delimiter Optional RegExp escaped; non-quote delimiters
      @param String end       Optional RegExp-escaped; forces the match to end
                              before an occurence of <end>
     */
    function cr(selector,attribute,marker,delimiter,end){
        if(typeof selector == "string") selector = new RegExp(selector,"gi");
        attribute = att + attribute;
        marker = typeof marker == "string" ? marker : "\\s*=\\s*";
        delimiter = typeof delimiter == "string" ? delimiter : "";
        end = typeof end == "string" ? "?)("+end : ")(";
        /* re1: double-quoted value, re2: single-quoted value, re3: unquoted */
        var re1 = new RegExp('('+attribute+marker+'")([^"'+delimiter+']+'+end+')','gi');
        var re2 = new RegExp("("+attribute+marker+"')([^'"+delimiter+"]+"+end+")",'gi');
        var re3 = new RegExp('('+attribute+marker+')([^"\'][^\\s>'+delimiter+']*'+end+')','gi');
        html = html.replace(selector,function(match){
            return match.replace(re1,by).replace(re2,by).replace(re3,by);
        });
    }
    /* 
      @name cri
      @description            Selects an attribute of a HTML element,and
                                performs a search-and-replace on certain values
      @param String selector  HTML element to match
      @param String attribute RegExp-escaped; HTML element attribute to match
      @param String front     RegExp-escaped; attribute value,prefix to match
      @param String flags     Optional RegExp flags,default "gi"
      @param String delimiter Optional RegExp-escaped; non-quote delimiters
      @param String end       Optional RegExp-escaped; forces the match to end
                                before an occurence of <end>
     */
    function cri(selector,attribute,front,flags,delimiter,end){
        if(typeof selector == "string") selector = new RegExp(selector,"gi");
        attribute = att + attribute;
        flags = typeof flags == "string" ? flags : "gi";
        /* re1/re2 capture the attribute value WITHOUT its closing quote */
        var re1 = new RegExp('('+attribute+'\\s*=\\s*")([^"]*)','gi');
        var re2 = new RegExp("("+attribute+"\\s*=\\s*')([^']+)",'gi');
        /* NOTE(review): at1/at2 need a quote right after the URL, but expect
         * no quote between <front> and the URL - confirm this matches the
         * inputs you care about (e.g. url("x") inside style='...'). */
        var at1 = new RegExp('('+front+')([^"]+)(")',flags);
        var at2 = new RegExp("("+front+")([^']+)(')",flags);
        var handleAttr;
        if(typeof delimiter == "string"){
            end = typeof end == "string" ? end : "";
            /* NOTE(review): <delimiter> is spliced into a character class, so
             * every character of its source text is excluded from the URL. */
            var at3 = new RegExp("("+front+")([^\"'][^"+delimiter+"]*" + (end?"?)("+end+")":")()"),flags);
            handleAttr = function(match,g1,g2){return g1+g2.replace(at1,by2).replace(at2,by2).replace(at3,by2)};
        } else {
            handleAttr = function(match,g1,g2){return g1+g2.replace(at1,by2).replace(at2,by2)};
        }
        html = html.replace(selector,function(match){
             return match.replace(re1,handleAttr).replace(re2,handleAttr);
        });
    }

    /* <meta http-equiv=refresh content="  ; url= " > */
    cri("<meta"+any+att+"http-equiv\\s*=\\s*(?:\""+ae("refresh")+"\""+any+">|'"+ae("refresh")+"'"+any+">|"+ae("refresh")+"(?:"+ae(" ")+any+">|>))","content",ae("url")+s+ae("=")+s,"i");

    cr("<"+any+att+"href\\s*="+any+">","href"); /* Linked elements */
    cr("<"+any+att+"src\\s*="+any+">","src"); /* Embedded elements */

    cr("<object"+any+att+"data\\s*="+any+">","data"); /* <object data= > */
    cr("<applet"+any+att+"codebase\\s*="+any+">","codebase"); /* <applet codebase= > */

    /* <param name=movie value= >*/
    cr("<param"+any+att+"name\\s*=\\s*(?:\""+ae("movie")+"\""+any+">|'"+ae("movie")+"'"+any+">|"+ae("movie")+"(?:"+ae(" ")+any+">|>))","value");

    cr(/<style[^>]*>(?:[^"']*(?:"[^"]*"|'[^']*'))*?[^'"]*(?:<\/style|$)/gi,"url","\\s*\\(\\s*","","\\s*\\)"); /* <style> */
    /* 4th argument (flags) is a non-string so that the default "gi" is used */
    cri("<"+any+att+"style\\s*="+any+">","style",ae("url")+s+ae("(")+s,0,s+ae(")"),ae(")")); /*< style=" url(...) " > */
    return html;
}

各函数(含内部私有函数)的简要说明

> rel_to_abs(url) – 将相对/未知的URL转换为绝对URL
> replace_all_rel_by_abs(html) – 将HTML字符串中出现的所有相对URL替换为绝对URL.

> ae – 任何实体 – 返回一个RE模式来处理HTML实体.
> by – replace by – 这个简短的函数负责调用实际的URL替换(rel_to_abs).它可能被调用数百次甚至上千次,因此(在自行定制时)注意不要在此函数中加入缓慢的算法.
> cr – 创建替换 – 创建并执行search-and-replace.Example:href =“…”(在任何HTML标签内).
> cri – 创建替换内联 – 在HTML标签内的所有样式属性内创建并执行search-and-replace.Example:url(..).

测试用例

打开任何页面,并将以下bookmarklet粘贴到位置栏中:

javascript:void function(){var el=document.createElement("script");el.src="http://rob.lekensteyn.nl/rel_to_abs.js";document.body.appendChild(el);}();

注入的代码包含如上定义的两个函数,加上测试用例,如下所示.注意:测试用例不会修改页面的HTML,但在textarea(可选)中显示已解析的结果.

/* Time one full pass over the current document's markup. */
var started_at = new Date().getTime();
var result = replace_all_rel_by_abs(document.documentElement.innerHTML);
var timing_question = new Date().getTime() - started_at + " milliseconds to execute\n\nPut results in new textarea?";

/* Optionally show the rewritten HTML in a full-page textarea; the page's own
 * HTML is never modified. Double-click the textarea to remove it again. */
if(confirm(timing_question)){
    var output_box = document.createElement("textarea");
    output_box.style.cssText = "position:fixed;top:0;left:0;width:100%;height:99%";
    output_box.ondblclick = function(){
        this.parentNode.removeChild(this);
    };
    output_box.value = result;
    document.body.appendChild(output_box);
}

也可以看看:

> Answer: Parsing and sanitising HTML strings

猜你在找的JavaScript相关文章