# robots.txt
#
# Layout: one group per crawler, each introduced by a comment and closed by
# an "end <name>" comment. Groups are separated by blank lines; inside a
# group a lone "#" is used as a visual spacer so the group is not split.
# A crawler obeys only the most specific group that names it; it falls back
# to the "User-agent: *" group at the bottom only when no other group matches.

# disallow, too intensive http://www.authoritativeweb.com/crawl
User-agent: ConveraCrawler
Disallow: /
#
### end ConveraCrawler

# disallow, access too fast, www.nameprotect.com/botinfo.html
User-agent: BPBot
Disallow: /
#
### end BPBot

# disallow, access too fast, www.nameprotect.com/botinfo.html
User-agent: NPBot
Disallow: /
#
### end NPBot

# disallow, access too fast, http://www.openfind.com.tw/robot.html
User-agent: Openbot
Disallow: /
#
### end Openbot

# disallow, access too fast, http://dir.com/pompos.html
User-agent: pompos
Disallow: /
#
### end pompos

# disallow, access too fast, http://www.aipbot.com
User-agent: aipbot
Disallow: /
#
### end aipbot

# disallow, access too fast (no bot-info URL recorded)
User-agent: BlitzBOT@tricus.net
Disallow: /
#
### end BlitzBOT

# disallow, for now 5/9/07
User-agent: Exabot
Disallow: /
#
### end Exabot

# slow down Teoma (ask jeeves)
User-agent: Teoma
Crawl-delay: 1
#
# Disallow these pages, contain many links to search_engine - resource intensive
Disallow: /commercial/resources/categories.shtml
Disallow: /commercial/resources/publishers.shtml
Disallow: /commercial/resources/titles.shtml
#
# Disallow script directories (except free-scripts)
Disallow: /admin_tools/
Disallow: /cgi-scripts/
Disallow: /eiu-ipadmin/
Disallow: /ip-scripts/
Disallow: /ppv-scripts/
Disallow: /secure-scripts/
Disallow: /toolbox/
Disallow: /utilities/
#
# Disallow browse pages, resource intensive
Disallow: /coms2/industrybrowse
Disallow: /coms2/browsesummary
Disallow: /coms2/industryalphalist
#
# Disallow sample pages
Disallow: /manta/sample-pdf
# NOTE(review): the rule below appears to be disabled - confirm intent
# Disallow: /coms2/dnbdescription
#
# Block Custom Search Engine page
Disallow: /coms2/page_cse
#
### end Teoma

# allow AdSense to access free-scripts for advertising links related to content
# (the product token is written without a trailing "*": "*" is not a valid
# character in a user-agent token and is not treated as a wildcard there)
User-agent: Mediapartners-Google
Disallow: /aboutecnext/seminar/2002/
#
# Disallow these pages, contain many links to search_engine - resource intensive
Disallow: /commercial/resources/categories.shtml
Disallow: /commercial/resources/publishers.shtml
Disallow: /commercial/resources/titles.shtml
#
# Disallow script directories (except free-scripts)
Disallow: /admin_tools/
Disallow: /cgi-scripts/
Disallow: /eiu-ipadmin/
Disallow: /ip-scripts/
Disallow: /ppv-scripts/
Disallow: /secure-scripts/
Disallow: /toolbox/
Disallow: /utilities/
#
# Disallow browse pages, resource intensive
Disallow: /coms2/industrybrowse
Disallow: /coms2/browsesummary
Disallow: /coms2/industryalphalist
#
# Disallow sample pages
Disallow: /manta/sample-pdf
# NOTE(review): the rule below appears to be disabled - confirm intent
# Disallow: /coms2/dnbdescription
#
# Block Custom Search Engine page
Disallow: /coms2/page_cse
#
### end Mediapartners-Google

# allow Yahoo to access free-scripts for advertising links related to content
# (this whole group is disabled: every directive is commented out with "##")
##User-agent: YahooYSMcm/2.0.0
##Disallow: /aboutecnext/seminar/2002/
#
# Disallow these pages, contain many links to search_engine - resource intensive
##Disallow: /commercial/resources/categories.shtml
##Disallow: /commercial/resources/publishers.shtml
##Disallow: /commercial/resources/titles.shtml
#
# Disallow script directories (except free-scripts)
##Disallow: /admin_tools/
##Disallow: /cgi-scripts/
##Disallow: /eiu-ipadmin/
##Disallow: /ip-scripts/
##Disallow: /ppv-scripts/
##Disallow: /secure-scripts/
##Disallow: /toolbox/
##Disallow: /utilities/
#
### end Yahoo

# PageBull.com - same as defaults, needs own stanza, won't honor User-agent: *
# NOTE(review): this group no longer mirrors the defaults - the jobs/answers,
# megabrowse, CBI, video, radlink, profile-tab, ?page= and user-profile rules
# of the "User-agent: *" group are absent here. Confirm whether to sync.
User-agent: pagebullbot
# forbid seminar
Disallow: /aboutecnext/seminar/2002/
#
# Disallow these pages, contain many links to search_engine - resource intensive
Disallow: /commercial/resources/categories.shtml
Disallow: /commercial/resources/publishers.shtml
Disallow: /commercial/resources/titles.shtml
#
# Disallow script directories (including free-scripts, as in the defaults)
Disallow: /free-scripts/
Disallow: /admin_tools/
Disallow: /cgi-scripts/
Disallow: /eiu-ipadmin/
Disallow: /ip-scripts/
Disallow: /ppv-scripts/
Disallow: /secure-scripts/
Disallow: /toolbox/
Disallow: /utilities/
Disallow: /comsite5/bin/
#
# Disallow browse pages, resource intensive
Disallow: /coms2/industrybrowse
Disallow: /coms2/browsesummary
Disallow: /coms2/industryalphalist
#
# Disallow sample pages
Disallow: /manta/sample-pdf
# NOTE(review): the rule below appears to be disabled - confirm intent
# Disallow: /coms2/dnbdescription
#
# Block Custom Search Engine page
Disallow: /coms2/page_cse
#
## end PageBull

# slurp
# NOTE(review): a crawler that matches this group obeys only this group and
# does not also apply the "User-agent: *" defaults below, so none of the
# default Disallow rules are in force for slurp. Confirm that is intended.
User-agent: slurp
# forbid company pages with trailing /
Disallow: /coms2/dnbcompany_*/
#
## end slurp

# Defaults for rest of crawlers
User-agent: *
# forbid seminar
Disallow: /aboutecnext/seminar/2002/
#
# Disallow these pages, contain many links to search_engine - resource intensive
Disallow: /commercial/resources/categories.shtml
Disallow: /commercial/resources/publishers.shtml
Disallow: /commercial/resources/titles.shtml
#
# Disallow script directories
Disallow: /free-scripts/
Disallow: /admin_tools/
Disallow: /cgi-scripts/
Disallow: /eiu-ipadmin/
Disallow: /ip-scripts/
Disallow: /ppv-scripts/
Disallow: /secure-scripts/
Disallow: /toolbox/
Disallow: /utilities/
Disallow: /comsite5/bin/
#
# Disallow browse pages, resource intensive
Disallow: /coms2/industrybrowse
Disallow: /coms2/browsesummary
Disallow: /coms2/industryalphalist
#
# Exclude syndicated content: jobs and Yahoo answers (on Manta)
Disallow: /jobs
Disallow: /answers
#
# Disallow sample pages
Disallow: /manta/sample-pdf
# NOTE(review): the rule below appears to be disabled - confirm intent
# Disallow: /coms2/dnbdescription
#
# Disallow US-megabrowse government pages simply to allow
# more timely crawling of other megabrowse pages
Disallow: /mb_*_F3*_*/
Allow: /mb_5*_*_F3*/
Disallow: /mb_5*_F3*_F3*/
#
# Disallow most agriculture pages
# NOTE(review): parsers that pick the longest matching pattern will let the
# 12-character Disallow below win over the 11-character "Allow: /mb_*_C0_*/",
# and parsers that pick the first matching rule will ignore every Allow that
# follows the Disallow. Verify these exceptions with a robots.txt tester.
Disallow: /mb_*_C0*_*/
Allow: /mb_*_C02E5*_*/
Allow: /mb_*_C02E6*_*/
Allow: /mb_*_C030D*_*/
Allow: /mb_*_C030E*_*/
Allow: /mb_*_C0_*/
#
# Disallow CBI pages to allow faster crawling of company pages
Disallow: /coms2/companycbi
#
# Block Custom Search Engine page
Disallow: /coms2/page_cse
Disallow: /video
Disallow: /coms2/page_radlink_results
# Block company profile tabs
Disallow: /company/*tab=
Disallow: /coms2/dnbcompany_*tab=
# Block broken comsite page links
Disallow: /?page=
# Block user profiles
Disallow: /people/
Disallow: /profile
Disallow: /member/
## end default