Friday, July 30, 2010

A browser crawler : the code

The Configuration class holds the configuration variables and a few helper methods.

/// <summary>
/// Central place for the crawler's tunable values: pause timing, crawl depth
/// and the "user-agent" strings.
/// </summary>
public class Configuration
    {
        /// <summary>"User-agent" HTTP header values; filled by the constructor.</summary>
        public List<string> browserIdentities = new List<string>();

        /// <summary>Number of entries generated for one pause sequence.</summary>
        public int sequenceLen = 10000;

        /// <summary>Base value, in seconds, that a long pause is derived from.</summary>
        public int bigPauseSeconds = 9;

        /// <summary>Base value, in seconds, that a short pause is derived from.</summary>
        public int littlePauseSeconds = 2;

        /// <summary>Lower bound used when drawing the random long-pause frequency.</summary>
        public int frequencyBigPauseMin = 5;

        /// <summary>Upper bound used when drawing the random long-pause frequency.</summary>
        public int frequencyBigPauseMax = 21;

        /// <summary>Maximum depth of exploration.</summary>
        public int deepBrowsing = 3;

        /// <summary>
        /// Creates a configuration with the default values and the known browser identities.
        /// </summary>
        public Configuration()
        {
            LoadBrowserIdentities();
        }

        /// <summary>
        /// Registers the known "user-agent" values (only for webclient class usage).
        /// </summary>
        private void LoadBrowserIdentities()
        {
            // Add further "user-agent" HTTP values to this array.
            string[] userAgents =
            {
                "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)"
            };

            foreach (string userAgent in userAgents)
            {
                browserIdentities.Add(userAgent);
            }
        }
    }
LoadBrowserIdentities loads the "user-agent" values for multiple browsers (at this time there is only one :-) ).

The Util class is a utility class that contains helper methods for the entire application.

 public class Util
    {

        public Util()
        {

        }
        
        public string HtmlEncode(string input)
        {
            return HttpUtility.UrlEncode(input);            
        }
        public List<int> CreatePauseSequence()
        {
            Configuration c = new Configuration();
            List<int> sequencePauseList = new List<int>();
            Random r = new Random();
            int bigPauseRndFrequency = r.Next(c.frequencyBigPauseMin, c.frequencyBigPauseMax);
            int start = 0;
            int end = c.sequenceLen;
            while (start < end)
            {
                if ((start % bigPauseRndFrequency) == 0)
                {
                    sequencePauseList.Add(this.BigPauseCalculator());
                }
                else
                {
                    sequencePauseList.Add(this.LittlePauseCalculator());
                }
                start++;
            }
            return sequencePauseList;
        }
        private int BigPauseCalculator()
        {
            Configuration c = new Configuration();
            int initial = c.bigPauseSeconds;
            Random r = new Random();
            int randRet = r.Next(10);
            int final = initial * randRet;
            return final;
        }
        private int LittlePauseCalculator()
        {
            int finalLittle = 0;
            Configuration c = new Configuration();
            Random r = new Random();
            int rndRet = r.Next(10);
            int l = c.littlePauseSeconds;
            finalLittle = rndRet * l;
            return finalLittle;
        }
    }
The NodeResource class is a node of the navigation process; it extends the Node class.

/// <summary>
/// Empty base type for the nodes handled by the navigation process.
/// </summary>
public class Node
    {
        /// <summary>Creates an empty node.</summary>
        public Node() { }
    }

    public class NodeResource:Node
    {
        public int deepNode = 0;
        public string uri = null;
        public bool visited = false;
        public HtmlDocument currDocument = null;
        public string strCurrDocument = null;
        public List<KeywordsResult> listK = null;
        public List<RegExResult> listR = null;
        public byte[] binaryResource;
        public NodeResource()
        {
        }
    }

The Navigation class represents the navigation process.

/// <summary>
/// State of one navigation process: the nodes collected so far, the search
/// criteria and the depth limit.
/// </summary>
public class Navigation
    {
        /// <summary>Nodes known to this navigation.</summary>
        public List<NodeResource> listNodes = new List<NodeResource>();

        /// <summary>Keywords to search for.</summary>
        public List<string> keywords = new List<string>();

        /// <summary>Regular expressions to apply.</summary>
        public List<string> regularExpression = new List<string>();

        /// <summary>Depth limit; set by the constructor from Configuration.deepBrowsing.</summary>
        public int deepNodeLimit = 0;

        /// <summary>
        /// Creates an empty navigation whose depth limit comes from a fresh Configuration.
        /// </summary>
        public Navigation()
        {
            this.deepNodeLimit = new Configuration().deepBrowsing;
        }
    }

The RegExResult class stores the results of the regular expressions extracted from the data of a URI.

/// <summary>
/// Result of one regular expression: the expression text and its matches.
/// </summary>
public class RegExResult
    {
        /// <summary>Text of the regular expression; null until assigned.</summary>
        public string regExtext;

        /// <summary>Matches found for the expression; null until assigned.</summary>
        public List<string> matchList;

        /// <summary>Creates an empty result.</summary>
        public RegExResult() { }
    }

How it works:
Here is the code-behind of a Windows form.
First, the Load method is called when the form is loaded; it creates a timer, but leaves it disabled. When the button on the form is clicked,
the Navigation class is instantiated and the timer's "Enabled" property is set to true. The timer runs and, when it elapses,
its "Enabled" property is first set to false, and then the Navigate method of the WebBrowser control is called. When the DocumentCompleted event of the WebBrowser control is raised, the NodeResource object receives the loaded HtmlDocument. Everything works by enabling and disabling the timer. I chose this solution in order to emulate a human user, and for the same reason there is a DoSleep() function: it stops the current thread for a pause, which is either short or long, with random values between a minimum and a maximum.
In the Configuration class there is a depth value: it is the maximum depth of exploration.



/// <summary>
/// Code-behind of the crawler form. A timer picks the next unvisited node and
/// asks the WebBrowser control to load it; when the document is complete the
/// node is filled in, its links are queued, a pause is taken and the timer is
/// re-enabled.
/// </summary>
public partial class Form1 : Form
    {
        List<int> calculatedSleepInterval = null;
        List<HtmlElement> links = new List<HtmlElement>();
        List<HtmlDocument> listHtmlDocs = new List<HtmlDocument>();
        public Navigation nav = null;
        System.Timers.Timer timer = null;
        NodeResource currNodeResource = null;

        public Form1()
        {
            InitializeComponent();
        }

        private void button6_Click(object sender, EventArgs e)
        {
            //add here result speech processor...
            //add <br> and "." parse consideration introducing pauses during reading process

        }

        /// <summary>
        /// Prepares the (disabled) timer and the pause sequence, and subscribes to
        /// the browser's DocumentCompleted event.
        /// </summary>
        private void Form1_Load(object sender, EventArgs e)
        {
            TimerCreation();
            //EmailManager email= new EmailManager();
            CalculateSequencePause();
            this.webBrowser1.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(webBrowser1_DocumentCompleted);
        }

        /// <summary>
        /// Creates the navigation timer in the disabled state.
        /// </summary>
        private void TimerCreation()
        {
            timer = new System.Timers.Timer();
            // Marshal Elapsed onto the form's thread: the handler drives the
            // WebBrowser control, which must not be called from a timer thread.
            timer.SynchronizingObject = this;
            timer.AutoReset = true;
            timer.Enabled = false;
            timer.Interval = 2000;
            timer.Elapsed += new ElapsedEventHandler(timer_Elapsed);
        }

        /// <summary>
        /// Navigates to the first unvisited node. The timer is stopped before
        /// navigating and is re-enabled by the DocumentCompleted handler; it is
        /// also stopped when there is nothing left to visit.
        /// </summary>
        /// <param name="sender"></param>
        /// <param name="e"></param>
        void timer_Elapsed(object sender, ElapsedEventArgs e)
        {
            if (this.nav != null)
            {
                foreach (NodeResource nr in this.nav.listNodes)
                {
                    if (nr.visited)
                    {
                        continue;
                    }

                    Uri target;
                    if (!TryGetNavigableUri(nr.uri, out target))
                    {
                        // Empty, relative or non-document addresses cannot be loaded;
                        // mark them as handled so the crawl does not get stuck on them.
                        nr.visited = true;
                        continue;
                    }

                    //here is unique point for calling Navigate() method
                    this.currNodeResource = nr;
                    this.timer.Enabled = false;
                    this.webBrowser1.Navigate(target);
                    return;
                }
            }

            // No unvisited resource left (or no navigation yet): stop polling.
            this.timer.Enabled = false;
        }

        /// <summary>
        /// Parses <paramref name="address"/> as an absolute http, https or file URI.
        /// </summary>
        /// <returns>True when <paramref name="target"/> holds a URI the crawler should load.</returns>
        private static bool TryGetNavigableUri(string address, out Uri target)
        {
            if (!Uri.TryCreate(address, UriKind.Absolute, out target))
            {
                return false;
            }

            // Other schemes (mailto:, javascript:, ...) presumably never produce a
            // DocumentCompleted event, which would leave the timer disabled forever.
            return target.Scheme == Uri.UriSchemeHttp
                || target.Scheme == Uri.UriSchemeHttps
                || target.Scheme == Uri.UriSchemeFile;
        }

        /// <summary>
        /// Generates a fresh pause sequence and stores it for DoSleep.
        /// </summary>
        private void CalculateSequencePause()
        {
            Util u = new Util();
            this.calculatedSleepInterval = u.CreatePauseSequence();
        }

        /// <summary>
        /// Returns the index of the first node whose uri equals
        /// <paramref name="address"/>, or -1 when there is none.
        /// </summary>
        private int IndexOfUri(string address)
        {
            for (int i = 0; i < this.nav.listNodes.Count; i++)
            {
                if (string.Equals(this.nav.listNodes[i].uri, address, StringComparison.Ordinal))
                {
                    return i;
                }
            }
            return -1;
        }

        /// <summary>
        /// Stores the loaded document in the current node, queues the page's links
        /// as new nodes (while under the depth limit), pauses, and re-enables the timer.
        /// </summary>
        void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
        {
            WebBrowser w = (WebBrowser)sender;

            // The event can fire before a crawl has been started; nothing to record then.
            if (this.nav == null || this.currNodeResource == null)
            {
                return;
            }

            //set currNode
            string loadedUri = w.Url.ToString();
            currNodeResource.uri = loadedUri;
            currNodeResource.currDocument = w.Document;
            currNodeResource.visited = true;
            currNodeResource.strCurrDocument = w.DocumentText;

            //replace currNode in list nodes (first node with the same uri)
            int existing = IndexOfUri(loadedUri);
            if (existing >= 0)
            {
                this.nav.listNodes[existing] = currNodeResource;
            }

            //here get sub nodes
            if (currNodeResource.deepNode < nav.deepNodeLimit && w.Document != null)
            {
                //links to  resources
                foreach (HtmlElement elem in w.Document.Links)
                {
                    string href = elem.GetAttribute("href");

                    // Skip empty links and addresses that are already queued, so the
                    // same page is not crawled again and again.
                    if (string.IsNullOrEmpty(href) || IndexOfUri(href) >= 0)
                    {
                        continue;
                    }

                    //create and append a node for each new uri
                    NodeResource n = new NodeResource();
                    n.deepNode = currNodeResource.deepNode + 1;
                    n.uri = href;
                    this.nav.listNodes.Add(n);
                }
            }

            //here re-enable timer object
            DoSleep();
            this.timer.Enabled = true;
        }

        /// <summary>
        /// Starts the navigation from the address typed in the URL text box.
        /// </summary>
        private void button8_Click(object sender, EventArgs e)
        {
            //start navigation
            if (this.nav == null)
            {
                // Must be stored in the field: the code below and the event
                // handlers all read this.nav.
                this.nav = new Navigation();
            }
            else
            {
                // clear previous navigation values
                // this.nav.listNodes.Clear();
            }
            //single node creation
            NodeResource node = new NodeResource();
            node.uri = this.textBoxUrl.Text;
            node.deepNode = 1;
            this.currNodeResource = node;
            //append node in navigation
            this.nav.listNodes.Add(node);
            this.timer.Enabled = true;
        }

        /// <summary>
        /// Consumes the next pause of the sequence and sleeps for that many seconds,
        /// regenerating the sequence when it is exhausted.
        /// </summary>
        private void DoSleep()
        {
            if (this.calculatedSleepInterval == null || this.calculatedSleepInterval.Count < 1)
            {
                this.CalculateSequencePause();
            }
            if (this.calculatedSleepInterval.Count < 1)
            {
                // Configured sequence length is zero: nothing to wait for.
                return;
            }
            int pause = this.calculatedSleepInterval[0];
            this.calculatedSleepInterval.RemoveAt(0);
            // NOTE(review): this blocks the calling thread (presumably the UI thread)
            // for the whole pause; kept because the design relies on it.
            Thread.Sleep(pause * 1000);
        }

    }
The application will provide a way to read the results aloud using a speech synthesizer.
At this time I am looking for a stand-alone synonyms dictionary, in order to add correlation to the basic, plain keyword search. I love multithreading techniques, but this application will emulate a human user, and a human has a single conscious process... or not?
Note: the code is not tested and not compiled...

0 commenti:

Post a Comment