January 22, 2012

bash shell urllist crawler

#!/bin/bash
# linux urllist.txt sitemap generator
#
# Fetches the start page with lynx, extracts every URL on it whose host part
# contains the configured site name, and appends those URLs to urllist.txt.
# Then each URL currently in the list is fetched once and the same-site URLs
# found on those pages are appended as well.  Finally the list is sorted,
# de-duplicated, and lines containing "..." are removed.
#
# URLs discovered during a run are not themselves crawled in that run:
# run the spider again for deeper URLs
#
# Usage: <script> [site [startpage]]
#   site       host name the URLs must contain (default: www.example.com)
#   startpage  first page to fetch             (default: http://<site>/)
#
# Requires: lynx
# written by Joachim De Zutter
# released under GPL

set -u

site="${1:-www.example.com}"
startpage="${2:-http://${site}/}"
urllist="urllist.txt"
queue="urllist.tmp"

if ! command -v lynx >/dev/null 2>&1; then
  printf '%s: lynx is required but was not found in PATH\n' "${0##*/}" >&2
  exit 1
fi

# The queue/temp file must not be left behind if the run is interrupted.
trap 'rm -f -- "$queue"' EXIT

# Escape BRE metacharacters so that e.g. the dots in the host name only match
# literal dots instead of any character.
site_re=$(printf '%s' "$site" | sed -e 's/[][\.*^$]/\\&/g')

# http:// or https://, optional sub-domain/userinfo (no slash allowed, so the
# site name has to appear in the host part), the site itself, then the path up
# to the first quote, backslash, angle bracket, '#' or '&'.
url_re="https\{0,1\}://[^/]*${site_re}/[^\"\'<>#\&]*"

#######################################
# Print the same-site URLs found on one page, one per line.
# Globals:   site, url_re (read)
# Arguments: $1 - URL of the page to fetch
# Outputs:   matching URLs on stdout
#######################################
extract_urls() {
  # stdin is detached so lynx cannot consume the list the caller's loop is
  # reading.  sed -n ... p drops lines that mention the site but contain no
  # URL; without it such lines would end up in the list verbatim.
  lynx -dump "$1" </dev/null \
    | grep -iF -- "$site" \
    | sed -n -e "s|.*\(${url_re}\).*|\1|p"
}

#######################################
# Print the current list sorted, without duplicates and without lines that
# contain "...".
# Globals:   urllist (read)
# Outputs:   cleaned list on stdout
#######################################
clean_list() {
  sort -u -- "$urllist" | grep -vF -- '...'
}

extract_urls "$startpage" >>"$urllist"

# Crawl from a snapshot of the list: the list itself is appended to inside the
# loop, and reading a file that is being rewritten at the same time is fragile.
clean_list >"$queue"

while IFS= read -r url; do
  [[ -n "$url" ]] || continue
  printf '%s\n' "$url"
  extract_urls "$url" >>"$urllist"
done <"$queue"

# Clean up once more so that the URLs appended by the last page are
# de-duplicated too.
clean_list >"$queue"
mv -- "$queue" "$urllist"

No comments: