#!/bin/bash
# Script to automatically generate a URL list for use with Yahoo!
# Output file is also suitable for use with Google Sitemap Generator and HTML_sitemap_gen
# Copyright Ben Tasker 2009
# Released under the GNU GPL
# See the LICENSE file or http://benscomputer.no-ip.org/LICENSE for more details

# ---- Configuration: adjust the three values below ----

# Root URL of the site to crawl. Keep the scheme as lowercase "http":
# the script picks URLs out of wget's log with a case-sensitive match.
WEBADDRESS='http://targetserver.com'

# Destination path of the finished URL list.
URLLISTLOCAT='/path/to/urllist.txt'

# Directory intended for temporary files. (The original author notes that
# wget did not like /tmp on their system for some reason.)
TMPLOCAT='/tmp'


# Code starts

# Scratch directory for this run; created in main, removed by cleanup.
workdir=''

#######################################
# Print a message to stderr and abort the script.
# Arguments: $* - message text
# Returns:   never; exits with status 1
#######################################
die() {
  printf 'urllist_gen: %s\n' "$*" >&2
  exit 1
}

#######################################
# Remove the scratch directory, if one was created.
# Globals: workdir (read)
#######################################
cleanup() {
  if [[ -n "$workdir" ]]; then
    rm -rf -- "$workdir"
  fi
}

#######################################
# List every URL wget requested, one per line, in first-seen order.
# wget logs each request on a line of the form
#   --TIMESTAMP--  http://address/page
# so the URL is taken as the last field of any line that starts with "--".
# (Anything wget inserts between the timestamp and the URL, such as a retry
# counter, is presumed to come before the URL.) Selecting on the line shape
# instead of filtering out words like "Location" or "response" means a URL
# that happens to contain those words is no longer lost. The match on "http"
# is case-sensitive. Repeated URLs are emitted only once.
# Arguments: $1 - path to the wget log file
# Outputs:   URLs on stdout
#######################################
extract_urls() {
  awk '/^--/ && $NF ~ /^http/ && !seen[$NF]++ { print $NF }' < "$1"
}

#######################################
# Filter out URLs we do not want indexed by Yahoo etc.: robots.txt and
# .ico files. The pattern is anchored to the end of the path (optionally
# followed by a query string or fragment) so that ".ico" inside a host or
# directory name does not discard unrelated pages. Case-insensitive.
# Inputs:  URLs on stdin, one per line
# Outputs: remaining URLs on stdout
#######################################
drop_unwanted() {
  grep -Eiv '(/robots\.txt|\.ico)([?#]|$)'
}

#######################################
# Crawl WEBADDRESS, keep the URLs that still respond, and write the list
# to URLLISTLOCAT.
# Globals: WEBADDRESS, URLLISTLOCAT, TMPLOCAT (read); workdir (written)
# Outputs: candidate URLs on stdout; wget's spider chatter on stderr
# Returns: 0 on success; exits non-zero on a fatal error
#######################################
main() {
  : "${WEBADDRESS:?WEBADDRESS must be set}"
  : "${URLLISTLOCAT:?URLLISTLOCAT must be set}"
  command -v wget > /dev/null || die "wget is required but was not found"

  # Prepare the environment: all scratch files live in a private directory
  # under TMPLOCAT and are removed on every exit path.
  local tmp_root=${TMPLOCAT:-${TMPDIR:-/tmp}}
  workdir=$(mktemp -d "${tmp_root%/}/urllist_gen.XXXXXX") \
    || die "cannot create a scratch directory under ${tmp_root}"
  trap cleanup EXIT

  local crawl_log="${workdir}/crawl.log"
  local candidates="${workdir}/candidates"
  local live="${workdir}/urllist"

  # Crawl the site and find all URLs that are available. wget's exit status
  # is deliberately not checked: a single broken link on the site makes a
  # recursive crawl exit non-zero even though the log is perfectly usable.
  wget -l 0 -U "Url_list_gen" -r --delete-after \
    -P "${workdir}/mirror" -o "$crawl_log" "$WEBADDRESS"
  [[ -f "$crawl_log" ]] || die "wget did not produce a log file"

  # Reduce the log to the list of URLs worth checking, and show it.
  extract_urls "$crawl_log" | drop_unwanted > "$candidates"
  cat -- "$candidates"

  # The list contains any page on our site that we have linked to, but if a
  # link is broken we don't want to make things worse by putting it into
  # sitemaps, so probe each one and keep only those wget is happy with.
  : > "$live"
  local url
  while IFS= read -r url; do
    # Wget reporting no errors should mean a HTTP 200 status.
    if wget -U "Url_list_gen" --spider "$url" < /dev/null; then
      printf '%s\n' "$url" >> "$live"
    fi
  done < "$candidates"

  # Move the URL list to its final location.
  mv -- "$live" "$URLLISTLOCAT" || die "could not write ${URLLISTLOCAT}"
}

main "$@"


