16 Haziran 2017 Cuma

C# crawler or spider source code

2017 crawler or spider source code
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
 
namespace CrawlerLib
{
    public class Crawler
    {
        string startUrl;
 
 
        /// <summary>
        /// Hole Response als HTML-String
        /// </summary>
        /// <param name="url">Übergeben Url</param>
        /// <param name="startUrl">Übergeben Start Url</param>
        /// <returns>HTML-String</returns>
        public string GetResponsetHtmlStr(string url, string startUrl)
        {
            string htmlStr = "";
            this.startUrl = startUrl;
 
            try
            {
                HttpWebRequest httpWebRequest = (HttpWebRequest)HttpWebRequest.Create(url);
                WebRequest webRequest = (WebRequest)httpWebRequest;
                webRequest.Proxy = null;
                WebResponse webResponse = webRequest.GetResponse();
                StreamReader sr = new StreamReader(webResponse.GetResponseStream());
                htmlStr = sr.ReadToEnd();
            }
            catch (WebException ex)
            {
                throw ex;
            }
            return htmlStr;
        }
        
 
        /// <summary>
        /// Durchsuche den HTML-String nach href-Links,
        /// füge diese einer Liste hinzu und gebe die Liste
        /// zurück
        /// </summary>
        /// <param name="htmlStr">HTML-String</param>
        /// <returns>Url-Liste</returns>
        public List<string> GetUrlList(string htmlStr)
        {
            string linkedUrl;
            List<string> urlList = new List<string>();
 
            Regex regexLink = new Regex("(?<=<a\\s*?href=(?:'|\"))[^'\"]*?(?=(?:'|\"))");
          
            foreach (var match in regexLink.Matches(htmlStr))
            {
                if (!urlList.Contains(match.ToString()))
                {
                    linkedUrl = GetLinkedUrl(match.ToString());
                    urlList.Add(linkedUrl);
                }
            }
 
            return urlList;
        }
 
 
        /// <summary>
        /// Überprüfe ob die Url nur mit /..../irgendeineseite.html
        /// beginnt, wenn ja füge die Start-Url hinzu um später eine
        /// korrekte Response zu erhalten
        /// </summary>
        /// <param name="url">übergebene Url</param>
        /// <returns>Fertige Url</returns>
        private string GetLinkedUrl(string url)
        {
            if (!url.Contains("http://"))
            {
                if (url.IndexOf("/", 0) != -1)
                {
                    url = this.startUrl + url;
                }
                else
                {
                    url = this.startUrl + "/" + url;
                }
                
            }
 
            return url;
        }
 
 
        /// <summary>
        /// Gibt einen Link mit http:// zurück, sofern
        /// die Url kein http:// besitzt. Ansonsten
        /// funktioniert der Request nicht
        /// </summary>
        /// <param name="url">Die zu überprüfende Url</param>
        /// <returns>Fertige überprüfte Url</returns>
        public string GetCheckedUrl(string url)
        {
            if (!url.Contains("http://"))
            {
                if (!url.Contains("https://"))
                {
                    url = "http://" + url;
                }
                
            }
 
            return url;
        }
 
 
        /// <summary>
        /// Vergleiche Url Listen, damit nur neue Links hinzugefügt werden
        /// Keine doppelten Links
        /// </summary>
        /// <param name="urlList1">Alte Url-Liste</param>
        /// <param name="urlList2">Neue Url-Liste</param>
        /// <returns>Verglichene Liste mit allen neuen Links</returns>
        public List<string> CompareUrlInList(List<string> urlList1, List<string> urlList2)
        {          
            List<string> newComparedList = urlList2.Except(urlList1).ToList();
            return newComparedList;
        }        
    }
}