c# – 从互联网下载HTML后,字符串中的字符已更改

前端之家收集整理的这篇文章主要介绍了c# – 从互联网下载HTML后,字符串中的字符已更改,前端之家小编觉得挺不错的,现在分享给大家,也给大家做个参考。
使用以下代码,我可以从互联网下载HTML文件:
// Reproduction from the question: fetch a page as a string with WebClient.
WebClient wc = new WebClient();

// ....

// NOTE(review): wc.Encoding is never set in the visible code, so the response
// bytes are presumably decoded with WebClient's default encoding rather than the
// page's real charset - likely the cause of the garbled characters; confirm
// against the WebClient.DownloadString / WebClient.Encoding documentation.
string downloadedFile = wc.DownloadString("http://www.myurl.com/");

但是,有时该文件包含“有趣”的字符,例如 é 变成 Ã©,← 变成 â†,フシギダネ 变成 ãƒ•ã‚·ã‚®ãƒ€ãƒ.

我认为这可能与不同的unicode编码类型之类的东西有关,因为每个字符都会变成2个新字符,也许每个字符都被拆成了两部分,但我在这方面的知识很少.你觉得问题出在哪里?

解决方法

这是一个封装好的下载类,它支持gzip,并会检查响应头中的编码和meta标签,以便正确解码页面内容.

实例化类,并调用GetPage().

/// <summary>
/// Downloads a page over HTTP and decodes it to a string using the charset
/// announced by the server (Content-Type header) or, failing that, by the
/// document itself (an HTML meta tag). Handles gzip- and deflate-compressed
/// response bodies.
/// </summary>
public class HttpDownloader
{
    // Extracts the charset parameter from a Content-Type header value.
    private static readonly Regex HeaderCharsetRegex = new Regex(
        @";\s*charset\s*=\s*(?<charset>.*)",
        RegexOptions.IgnoreCase);

    // Extracts the charset from an HTML meta tag. Accepts both the quoted and the
    // unquoted form (charset="utf-8" / charset=utf-8), and [^>] keeps the match
    // inside a single tag instead of running on to a later "charset=" in the page.
    private static readonly Regex MetaCharsetRegex = new Regex(
        @"<meta\s+[^>]*?charset\s*=\s*[""']?(?<charset>[A-Za-z0-9_-]+)",
        RegexOptions.Singleline | RegexOptions.IgnoreCase);

    private readonly string _referer;
    private readonly string _userAgent;

    /// <summary>
    /// Encoding used to decode the body. Starts as ISO-8859-1 and is replaced by
    /// the charset from the response headers when one is present and recognized.
    /// </summary>
    public Encoding Encoding { get; set; }

    /// <summary>Headers of the most recent response; set by <see cref="GetPage"/>.</summary>
    public WebHeaderCollection Headers { get; set; }

    /// <summary>
    /// The address to download. After <see cref="GetPage"/> it holds the URI that
    /// actually answered the request (the response's ResponseUri).
    /// </summary>
    public Uri Url { get; set; }

    /// <summary>Creates a downloader for a single URL.</summary>
    /// <param name="url">Absolute URL to download; parsed with <see cref="Uri"/>, which throws if it is malformed.</param>
    /// <param name="referer">Referer header to send; skipped when null or empty.</param>
    /// <param name="userAgent">User-Agent header to send; skipped when null or empty.</param>
    public HttpDownloader(string url, string referer, string userAgent)
    {
        Encoding = Encoding.GetEncoding("ISO-8859-1");
        Url = new Uri(url); // verify the uri
        _userAgent = userAgent;
        _referer = referer;
    }

    /// <summary>
    /// Requests <see cref="Url"/>, records the response headers and final URI,
    /// and returns the decoded, trimmed body.
    /// </summary>
    /// <returns>The page content as a string.</returns>
    public string GetPage()
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
        if (!string.IsNullOrEmpty(_referer))
        {
            request.Referer = _referer;
        }
        if (!string.IsNullOrEmpty(_userAgent))
        {
            request.UserAgent = _userAgent;
        }

        request.Headers.Add(HttpRequestHeader.AcceptEncoding, "gzip,deflate");

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            Headers = response.Headers;
            Url = response.ResponseUri;
            return ProcessContent(response);
        }
    }

    /// <summary>
    /// Reads the whole (decompressed) body into memory, decodes it with
    /// <see cref="Encoding"/>, then lets a meta-tag charset override that decoding.
    /// </summary>
    private string ProcessContent(HttpWebResponse response)
    {
        SetEncodingFromHeader(response);

        using (MemoryStream memStream = new MemoryStream())
        {
            // Buffer the raw bytes so they can be decoded a second time if the
            // document declares a different charset than the headers did.
            using (Stream body = OpenBodyStream(response))
            {
                body.CopyTo(memStream);
            }

            memStream.Position = 0;
            using (StreamReader reader = new StreamReader(memStream, Encoding))
            {
                string html = reader.ReadToEnd().Trim();
                return CheckMetaCharSetAndReEncode(memStream, html);
            }
        }
    }

    /// <summary>
    /// Returns the response stream, wrapped in a decompressor when the
    /// Content-Encoding header mentions gzip or deflate.
    /// </summary>
    private static Stream OpenBodyStream(HttpWebResponse response)
    {
        Stream raw = response.GetResponseStream();
        string contentEncoding = response.ContentEncoding ?? string.Empty;

        // Ordinal comparison: header tokens are protocol text, not natural
        // language, so culture-specific casing rules must not apply.
        if (contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            return new GZipStream(raw, CompressionMode.Decompress);
        }
        if (contentEncoding.IndexOf("deflate", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            return new DeflateStream(raw, CompressionMode.Decompress);
        }
        return raw;
    }

    /// <summary>
    /// Sets <see cref="Encoding"/> from the response's CharacterSet, or from the
    /// charset parameter of its Content-Type header when CharacterSet is empty.
    /// Leaves <see cref="Encoding"/> untouched when no usable charset is found.
    /// </summary>
    private void SetEncodingFromHeader(HttpWebResponse response)
    {
        string charset = response.CharacterSet;
        if (string.IsNullOrEmpty(charset))
        {
            Match m = HeaderCharsetRegex.Match(response.ContentType ?? string.Empty);
            if (m.Success)
            {
                charset = m.Groups["charset"].Value.Trim().Trim('\'', '"');
            }
        }

        if (string.IsNullOrEmpty(charset))
        {
            return;
        }

        try
        {
            Encoding = Encoding.GetEncoding(charset);
        }
        catch (ArgumentException)
        {
            // Unknown charset name: deliberately keep the current encoding
            // rather than failing the whole download.
        }
    }

    /// <summary>
    /// Looks for a charset declared in an HTML meta tag and, when it names an
    /// encoding different from <see cref="Encoding"/>, decodes the buffered
    /// bytes again with it.
    /// </summary>
    /// <param name="memStream">Seekable stream holding the raw body bytes. It is closed if a re-decode happens.</param>
    /// <param name="html">The body as already decoded with <see cref="Encoding"/>.</param>
    /// <returns>The re-decoded, trimmed text, or <paramref name="html"/> unchanged.</returns>
    private string CheckMetaCharSetAndReEncode(Stream memStream, string html)
    {
        Match m = MetaCharsetRegex.Match(html);
        if (!m.Success)
        {
            return html;
        }

        string charset = m.Groups["charset"].Value;

        // The regex could only match because the markup was readable as
        // single-byte text, so a declared "unicode"/"utf-16" is treated as utf-8.
        if (string.Equals(charset, "unicode", StringComparison.OrdinalIgnoreCase) ||
            string.Equals(charset, "utf-16", StringComparison.OrdinalIgnoreCase))
        {
            charset = "utf-8";
        }

        try
        {
            Encoding metaEncoding = Encoding.GetEncoding(charset);

            // Value comparison: two Encoding objects for the same charset are
            // not guaranteed to be the same instance.
            if (!metaEncoding.Equals(Encoding))
            {
                memStream.Position = 0L;
                using (StreamReader recodeReader = new StreamReader(memStream, metaEncoding))
                {
                    html = recodeReader.ReadToEnd().Trim();
                }
            }
        }
        catch (ArgumentException)
        {
            // Unknown charset name in the meta tag: deliberately keep the text
            // as decoded from the header/default encoding.
        }

        return html;
    }
}
原文链接:https://www.f2er.com/csharp/96751.html

猜你在找的C#相关文章