# robots.txt - crawler access rules for this site.
# Usage notes: http://pupwebhost.com/robots/use.html
#
# Layout: one group per robot. Each group is a "User-agent:" line followed by
# the "Disallow:" path prefixes that robot must not fetch. Every directive must
# sit on its own line: a "#" starts a comment that runs to the end of the line,
# so directives written after a "#" on the same line are ignored by parsers.
#
# NOTE(review): the pattern "/_*.$/" is kept exactly as originally written.
# "$" is only defined as an end-of-pattern anchor, so its meaning in the middle
# of a pattern is parser-dependent - confirm what this rule was meant to block.
# NOTE(review): several User-agent values below include version strings or
# spaces (e.g. "Checkbot/x.xx LWP/5.x"); they are preserved verbatim, but
# crawlers normally match on their product token only - confirm they still match.

# Google Search engine
User-agent: Googlebot
Disallow: /cgi-bin/
Disallow: /images/

User-agent: Googlebot-mobile
Disallow: /cgi-bin/
Disallow: /images/
Disallow: /_*.$/

# MSN Search engine
User-agent: MSNBot
Disallow: /cgi-bin/
Disallow: /images/

# Yahoo Search engine
User-agent: YahooSeeker
Disallow: /cgi-bin/
Disallow: /images/

# AltaVista search engine agent
# http://www.altavista.com/
User-agent: Scooter
Disallow: /cgi-bin/
Disallow: /images/

# AOL and Excite Search engine
User-agent: ArchitextSpider
Disallow: /cgi-bin/
Disallow: /images/

# Indexing statistics
# http://backrub.stanford.edu/
User-agent: BackRub/*.*
Disallow: /cgi-bin/
Disallow: /images/
Disallow: /_*.$/

# Macintosh-hosted link validation tool.
# http://pauillac.inria.fr/~fpottier/mac-soft.html.en
User-agent: Big Brother
Disallow: /cgi-bin/
Disallow: /images/
Disallow: /_*.$/

# Research the growth of specific sites.
# http://140.190.65.12/~khooghee/index.html
User-agent: BlackWidow
Disallow: /cgi-bin/
Disallow: /images/
Disallow: /_*.$/

# Locates chemical structures in Chemical MIME formats on WWW and FTP servers
# and downloads them into a database searchable with structure queries
# (substructure, fullstructure, formula, properties etc.)
# http://schiele.organik.uni-erlangen.de/cactvs/spider.html
User-agent: CACTVS Chemistry Spider
Disallow: /cgi-bin/
Disallow: /images/
Disallow: /_*.$/

# Checkbot checks links in a given set of pages on one or more servers.
# It reports links which returned an error code.
# http://www.xs4all.nl/~graaff/checkbot/
User-agent: Checkbot/x.xx LWP/5.x
Disallow: /cgi-bin/
Disallow: /images/
Disallow: /_*.$/

# Generates a Resource Discovery database, performs mirroring, and generates
# statistics. Uses a combination of Informix(tm) Database and WN 1.11 server
# software for indexing/resource discovery, fulltext search, text excerpts.
# http://deweb.orbit.de/
User-agent: Deweb/1.01
Disallow: /cgi-bin/
Disallow: /images/
Disallow: /_*.$/

# Indexing bot
# http://www.planetsearch.com/
User-agent: fido/0.9 Harvest/1.4.pl2
Disallow: /cgi-bin/
Disallow: /images/
Disallow: /_*.$/

# Generates a Resource Discovery database. Collects WWW pages for both
# InfoSeek's free WWW search and commercial search. Very fast, but never has
# more than one request per site outstanding at any given time.
# http://www.infoseek.com
User-agent: InfoSeek Robot 1.0
Disallow: /cgi-bin/
Disallow: /images/
Disallow: /_*.$/

# Collects WWW pages for InfoSeek's free WWW search services.
# http://www.infoseek.com/
User-agent: Infoseek Sidewinder
Disallow: /cgi-bin/
Disallow: /images/
Disallow: /_*.$/

# AskJeeves search engine
# http://www.ask.com
User-agent: AskJeeves
Disallow: /cgi-bin/
Disallow: /images/

# Its purpose is to generate a Resource Discovery database. Intended to seek
# out sites of potential "career interest". Hence - Job Robot.
# http://www.micrognosis.com/~ajack/jobot/jobot.html
User-agent: Jobot/0.1alpha libwww-perl/4.0
Disallow: /cgi-bin/
Disallow: /images/
Disallow: /_*.$/

# The KO_Yappo_Robot robot is used to build the database for the Yappo search
# service by k,osawa (part of AOL). The robot runs on random days and visits
# sites in a random order.
# http://yappo.com/info/robot.html
User-agent: KO_Yappo_Robot/1.0.4(http://yappo.com/info/robot.html)
Disallow: /cgi-bin/
Disallow: /images/
Disallow: /_*.$/

# This is a research program in providing information retrieval and discovery
# in the WWW, using a finite memory model of the web to guide intelligent,
# directed searches for specific information needs.
# http://lycos.cs.cmu.edu/
User-agent: Lycos/x.x
Disallow: /cgi-bin/
Disallow: /images/
Disallow: /_*.$/

# Indexing
# http://www.cs.colostate.edu/~sonnen/projects/nomad.html
User-agent: Nomad-V2.x
Disallow: /cgi-bin/
Disallow: /images/
Disallow: /_*.$/

# Finds URLs for K-12 content management.
# http://www.urlabs.com/
User-agent: SafetyNet Robot 0.1,
Disallow: /cgi-bin/
Disallow: /images/
Disallow: /_*.$/

# Search engine
# http://webcrawler.com
User-agent: WebCrawler/3.0 Robot libwww/5.0a
Disallow: /cgi-bin/
Disallow: /images/

# Search engine
# http://webcrawler.com
User-agent: WebCrawler-AddURL/2.0
Disallow: /cgi-bin/
Disallow: /images/

# Catch-all: any robot not listed above is pointed at the empty folder
# /norobots/ as its only disallowed path; everything else stays crawlable.
# Rationale from the original author: spelling out real directories in a
# blanket rule would reveal the directory tree, and that information could
# aid would-be hackers in finding your files.
User-agent: *
Disallow: /norobots/