3 Ekim 2014 Cuma

C# crawler

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using Tool;
using System.Net;
using System.Text.RegularExpressions;
using System.Threading;

namespace Search
{
    public partial class Form1 : Form
    {
        /// <summary>
        /// Creates the form and builds its controls by calling <c>InitializeComponent</c>.
        /// </summary>
        public Form1()
        {
            // InitializeComponent is not defined in this file; presumably it lives in the
            // designer-generated half of this partial class.
            InitializeComponent();
        }
        /// <summary>
        /// First-in, first-out queue of URL strings that are waiting to be visited.
        /// </summary>
        public class Queue
        {
            // Backing store: new items go to the tail, items are taken from the head.
            private readonly LinkedList<string> queue = new LinkedList<string>();

            /// <summary>Appends <paramref name="t"/> to the tail of the queue.</summary>
            /// <param name="t">The string to enqueue.</param>
            public void enQueue(string t)
            {
                queue.AddLast(t);
            }

            /// <summary>Removes and returns the item at the head of the queue.</summary>
            /// <returns>
            /// The item that has been queued the longest, or <c>null</c> when the queue is empty.
            /// </returns>
            public string deQueue()
            {
                // Reading queue.Last (as this method used to) turned the queue into a stack
                // and dereferenced null when the list was empty.
                if (queue.Count == 0)
                {
                    return null;
                }

                string head = queue.First.Value;
                queue.RemoveFirst();
                return head;
            }

            /// <summary>Tells whether the queue currently holds no items.</summary>
            /// <returns><c>true</c> when the queue is empty; otherwise <c>false</c>.</returns>
            public bool isQueueEmpty()
            {
                return queue.Count == 0;
            }

            /// <summary>Tells whether <paramref name="t"/> is currently queued.</summary>
            /// <param name="t">The string to look for.</param>
            /// <returns><c>true</c> when an equal string is in the queue.</returns>
            public bool contians(string t)
            {
                // Linear scan of the linked list.
                return queue.Contains(t);
            }

            /// <summary>Returns the number of queued items.</summary>
            public int getcount()
            {
                return queue.Count;
            }
        }
        /// <summary>
        /// Static bookkeeping for the crawl: the set of URLs already visited and the
        /// queue of URLs still waiting to be visited.
        /// </summary>
        public class LinkQueue
        {
            // URLs that have already been visited.
            private static readonly ISet<string> visited = new HashSet<string>();

            // URLs that are still waiting to be visited.
            private static readonly Queue pending = new Queue();

            /// <summary>Returns the queue of URLs waiting to be visited.</summary>
            public static Queue getUnVisitedUrl()
            {
                return pending;
            }

            /// <summary>Records <paramref name="url"/> as visited.</summary>
            public static void addVisitedUrl(String url)
            {
                visited.Add(url);
            }

            /// <summary>Removes <paramref name="url"/> from the visited set.</summary>
            public static void removeVisitedUrl(String url)
            {
                visited.Remove(url);
            }

            /// <summary>Takes the next URL out of the waiting queue.</summary>
            /// <returns>Whatever the waiting queue's <c>deQueue</c> returns, typed as <c>Object</c>.</returns>
            public static Object unVisitedUrlDeQueue()
            {
                return pending.deQueue();
            }

            /// <summary>
            /// Queues <paramref name="url"/> for a visit unless it is null, blank,
            /// already visited, or already waiting.
            /// </summary>
            public static void addUnvisitedUrl(String url)
            {
                if (url == null || url.Trim().Length == 0)
                {
                    return;
                }

                if (visited.Contains(url) || pending.contians(url))
                {
                    return;
                }

                pending.enQueue(url);
            }

            /// <summary>Returns how many URLs have been recorded as visited.</summary>
            public static int getVisitedUrlNum()
            {
                return visited.Count;
            }

            /// <summary>Tells whether the waiting queue is empty.</summary>
            public static bool unVisitedUrlsEmpty()
            {
                return pending.isQueueEmpty();
            }
        }


        // Array of 100 string slots. NOTE(review): nothing in this file reads or writes it;
        // confirm it is unused in the rest of the partial class before removing.
        string[] urlarr=new string[100];
        /// <summary>
        /// Fetches the page whose address is in <c>textBox1</c> (or
        /// <c>http://image.baidu.com/</c> when the box is empty), lists the extracted
        /// links and image URLs in <c>richTextBox1</c>, passes every image URL to
        /// <c>zzHttp.downfile</c> with the folder <c>d:\pic\</c>, and shows the number
        /// of processed images in <c>label1</c>.
        /// </summary>
        /// <param name="sender">The control that raised the event (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void button1_Click(object sender, EventArgs e)
        {
            zzHttp http = new zzHttp();
            CookieContainer cookie = new CookieContainer();
            string url = textBox1.Text != "" ? textBox1.Text : "http://image.baidu.com/";
            string content = http.SendDataByGET(url, "", ref cookie);

            string baseUri = Utility.GetBaseUri(url);
            string[] links = Parser.ExtractLinks(baseUri, content);
            foreach (string link in links)
            {
                // AppendText avoids rebuilding the whole Text property for every link.
                richTextBox1.AppendText(link + "\n");
            }

            // Captures the src attribute of every <img> tag in the named group "imgUrl".
            Regex regImg = new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);
            MatchCollection matches = regImg.Matches(content);

            // Walk the matches directly. The previous loop compared a rising index with the
            // shrinking count of a queue it was draining, so it stopped after about half
            // of the images.
            int k = 0;
            foreach (Match match in matches)
            {
                string picurl = match.Groups["imgUrl"].Value;
                if (picurl.Length == 0)
                {
                    // The pattern allows an empty src; there is nothing to download then.
                    continue;
                }

                richTextBox1.AppendText(picurl + "\n");

                // The last path segment of the URL is used as the local file name.
                string[] s = picurl.Split('/');
                string picname = s[s.Length - 1];
                zzHttp.downfile(picurl, picname, @"d:\pic\");
                k++;
            }
            label1.Text = k + "Zhang";
        }

        /// <summary>
        /// Crawl loop: seeds <c>LinkQueue</c> with a start URL, then repeatedly takes
        /// a waiting URL, downloads it, records it as visited, and queues the links found
        /// on it. Stops when no URLs are waiting or more than 1000 have been visited.
        /// Progress is reported through <c>Add2Message</c> and <c>AddMessage</c>.
        /// </summary>
        void search()
        {
            // NOTE(review): the seed literal on this line was unreadable in the source
            // (it was an unterminated string). The URL below reuses the default start page
            // of button1_Click - confirm the intended seed.
            const string seedUrl = "http://image.baidu.com/";
            // The loop keeps running while the visited count is at or below this value.
            const int maxVisited = 1000;

            LinkQueue.addUnvisitedUrl(seedUrl);
            while (!LinkQueue.unVisitedUrlsEmpty()
                && LinkQueue.getVisitedUrlNum() <= maxVisited)
            {
                // Take the next waiting URL.
                String visitUrl = (String)LinkQueue.unVisitedUrlDeQueue();
                if (visitUrl == null)
                {
                    continue;
                }

                // Download the page.
                zzHttp downLoader = new zzHttp();
                CookieContainer cookie = new CookieContainer();
                string content = downLoader.SendDataByGET(visitUrl, "", ref cookie);

                // Record the URL as visited so it is not queued again.
                LinkQueue.addVisitedUrl(visitUrl);

                // Extract the links contained in the downloaded page.
                string baseUri = Utility.GetBaseUri(visitUrl);
                string[] links = Parser.ExtractLinks(baseUri, content);

                Add2Message("Has access number:" + LinkQueue.getVisitedUrlNum() + ",count=" + LinkQueue.getUnVisitedUrl().getcount());
                foreach (string link in links)
                {
                    // Skip links that look like stylesheets, scripts or images.
                    // NOTE(review): this is a plain substring test, so it also drops any
                    // URL that merely contains "js", "css", etc. somewhere in its text.
                    if (link.Contains("css") || link.Contains("js") || link.Contains("gif") || link.Contains("jpg") || link.Contains("png") || link.Contains("jpeg"))
                    {
                        continue;
                    }

                    LinkQueue.addUnvisitedUrl(link);
                    AddMessage(link);
                }
            }
        }

        /// <summary>
        /// Starts the crawl (<c>search</c>) on a new thread.
        /// </summary>
        /// <param name="sender">The control that raised the event (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void button2_Click(object sender, EventArgs e)
        {
            // NOTE(review): every click starts another crawler thread over the same
            // static LinkQueue state - confirm whether repeated clicks should be allowed.
            Thread crawler = new Thread(search);
            // A foreground thread would keep the process alive after the form is closed.
            crawler.IsBackground = true;
            crawler.Start();
        }

        // Delegate type used to re-run AddMessage through richTextBox1.Invoke.
        private delegate void InfoDelegate(string message);

        /// <summary>
        /// Appends <paramref name="message"/> plus a line break to <c>richTextBox1</c>
        /// and scrolls to the caret. When <c>richTextBox1.InvokeRequired</c> is true,
        /// the call is re-issued through <c>richTextBox1.Invoke</c> instead.
        /// </summary>
        /// <param name="message">The text to append.</param>
        public void AddMessage(string message)
        {
            if (!richTextBox1.InvokeRequired)
            {
                richTextBox1.AppendText(message + Environment.NewLine);
                richTextBox1.ScrollToCaret();
                return;
            }

            // Hand the same call to the control so it performs the update itself.
            InfoDelegate marshal = AddMessage;
            object[] args = new object[] { message };
            richTextBox1.Invoke(marshal, args);
        }
        // Delegate type used to re-run Add2Message through label2.Invoke.
        private delegate void Info2Delegate(string message);

        /// <summary>
        /// Replaces the text of <c>label2</c> with <paramref name="message"/>. When
        /// <c>label2.InvokeRequired</c> is true, the call is re-issued through
        /// <c>label2.Invoke</c> instead.
        /// </summary>
        /// <param name="message">The text to show.</param>
        public void Add2Message(string message)
        {
            if (!label2.InvokeRequired)
            {
                label2.Text = message;
                return;
            }

            // Hand the same call to the control so it performs the update itself.
            Info2Delegate marshal = Add2Message;
            object[] args = new object[] { message };
            label2.Invoke(marshal, args);
        }
    }
}

Hiç yorum yok:

Yorum Gönder