#!/bin/bash
#
# linux urllist.txt sitemap generator
# run the spider again for deeper URLs
# written by Joachim De Zutter
# released under GPL
#
# Usage: <script> [site [startpage]]
#   site       host name whose URLs are collected (default: www.example.com)
#   startpage  page used to seed the list         (default: http://<site>/)
#
# One run performs a single crawl pass: the start page is fetched, the URLs
# it yields are appended to urllist.txt, and then every URL that was in the
# list at that point is fetched once. URLs discovered during the pass are
# appended to the list but are only visited by the next run.
#
# Requires: lynx, grep with -o support, sort.

site="${1:-www.example.com}"
startpage="${2:-http://${site}/}"
url_list="urllist.txt"

# Print a message on stderr and abort the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Filter text on stdin down to the URLs that belong to a host.
# Arguments: $1 - host name to keep
# Outputs:   one URL per line on stdout; input without a match prints nothing
#######################################
extract_links() {
  # Escape the dots so they match literally. This assumes the host consists
  # only of letters, digits, hyphens and dots, i.e. that dots are the only
  # regex metacharacters it can contain.
  local host_re=${1//./\\.}

  # -o prints only the matched URL, so surrounding text never reaches the
  # list. The host must sit in the authority part (before the first '/'),
  # optionally preceded by a subdomain, and must be followed by '/'.
  # A URL ends at the first backslash, quote, '<', '>', '#', '&' or
  # whitespace character.
  grep -oiE "https?://[^/[:space:]]*${host_re}/[^\\\"'<>#&[:space:]]*"
}

#######################################
# Dump one page with lynx and print the URLs on it that belong to $site.
# Globals:   site (read)
# Arguments: $1 - URL of the page to fetch
# Outputs:   one URL per line on stdout
#######################################
fetch_links() {
  # stdin comes from /dev/null so lynx cannot consume the URL stream that
  # the calling loop is reading from.
  lynx -dump "$1" </dev/null | extract_links "$site"
}

#######################################
# Sort the URL list, drop duplicates and drop entries containing "...".
# Globals:   url_list (read)
# Returns:   non-zero if the list could not be sorted or replaced; the
#            existing list is left untouched when sorting fails
#######################################
dedupe_list() {
  local tmp="urllist.tmp"

  sort -u -- "$url_list" | grep -vF -- '...' >"$tmp"
  # Only replace the list when sort itself succeeded; grep exiting non-zero
  # merely means that no line survived the filter.
  if ((PIPESTATUS[0] != 0)); then
    rm -f -- "$tmp"
    return 1
  fi
  mv -- "$tmp" "$url_list"
}

#######################################
# Run one crawl pass.
# Globals:   startpage, url_list (read)
# Outputs:   each visited URL on stdout, as progress
# Returns:   status of the final clean-up of the list
#######################################
main() {
  local url

  command -v lynx >/dev/null 2>&1 ||
    die "lynx is required but was not found in PATH"

  # Seed the list; URLs kept from earlier runs stay in place.
  fetch_links "$startpage" >>"$url_list"

  # sort has read the whole list before it prints its first line, so the
  # loop walks a de-duplicated snapshot and the appends made below do not
  # feed back into the current pass.
  while IFS= read -r url; do
    [[ -n "$url" ]] || continue
    printf '%s\n' "$url"
    fetch_links "$url" >>"$url_list"
  done < <(sort -u -- "$url_list" | grep -vF -- '...')

  dedupe_list
}

main
# Exit explicitly so bash never reads anything that follows this line in the
# file; the exit status is that of main.
exit
January 22, 2012
bash shell urllist crawler
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment