Disclaimer: This script is for personal use only. Commercial use is a breach of Craigslist's terms of service (TOS).
#! /bin/bash
# Search path and query string appended to every Craigslist base URL.
# Edit this value to gather data for a different search across all
# Craigslist domains.
relativeURL='/search/cta?autoMakeModel=ford+escape&query=ford+escape'

# Start every run with an empty "temp" file; rm's complaint about a file
# that does not exist yet is deliberately discarded.
rm temp 2> /dev/null
touch temp
#######################################
# Collect posting URLs from one Craigslist site and append them to ./temp.
#
# Fetches the search-result pages for the `s` query values 200, 100 and 0,
# keeps the lines that contain "data-id", extracts every href="..." value on
# those lines and appends one absolute URL per line to the file "temp" in
# the current directory. Needs wget and a grep that supports -P (GNU grep).
# Download or grep failures are not checked; a page that cannot be fetched
# simply contributes no URLs.
# Globals:   relativeURL (read) - search path and query appended to $1
# Arguments: $1 - base URL of the site, without a trailing slash
#                 (presumably of the form http://<city>.craigslist.org)
# Outputs:   every fetched page URL on stdout; posting URLs appended to ./temp
# Returns:   0 after the last page, 1 if no base URL was given
#######################################
getcityURLs() {
  local base=$1
  if [[ -z $base ]]; then
    printf 'getcityURLs: missing base URL\n' >&2
    return 1
  fi

  # Defined inside getcityURLs on purpose: the helper must exist wherever
  # getcityURLs itself is available (e.g. after `export -f getcityURLs`),
  # and only the outer function is exported.
  #
  # Reads hrefs from stdin, one per line, and appends them to ./temp:
  #   //host/path   -> scheme of the base URL + href (protocol-relative link;
  #                    prefixing the whole base URL would give an invalid URL)
  #   http...       -> unchanged
  #   anything else -> base URL + href
  # Arguments: $1 - base URL of the site the hrefs came from
  normalizeURLs() {
    local site=$1 scheme=http: postURL
    if [[ $site == *://* ]]; then
      scheme=${site%%//*}
    fi
    # -r keeps backslashes literal; the default IFS still trims surrounding
    # whitespace from each href.
    while read -r postURL; do
      if [[ $postURL == //* ]]; then
        postURL=$scheme$postURL
      elif [[ $postURL != http* ]]; then
        postURL=$site$postURL
      fi
      printf '%s\n' "$postURL" >> temp
    done
  }

  # The maximum number of posts collected from each domain is the value of
  # results + 100 (assuming each result page lists 100 posts - TODO confirm).
  local results=200 url
  while (( results > -1 )); do
    url="$base$relativeURL&s=$results"
    printf '%s\n' "$url"
    wget -q -O- "$url" \
      | grep 'data-id' \
      | grep -Po '(?<=href=")[^"]*' \
      | normalizeURLs "$base"
    results=$(( results - 100 ))
  done
}
# The shells started by GNU parallel only see exported names, so export both
# the fetch function and the search path it reads.
export -f getcityURLs
export relativeURL

# Feed the lines of baseURLs to parallel, which runs getcityURLs on them with
# at most 20 jobs at a time.
cat baseURLs \
  | parallel -j 20 --gnu getcityURLs

echo 'removing duplicate listings'
# Copy temp to listingURLs, printing each distinct line only the first time
# it is seen (original order is kept).
awk '!a[$0]++' temp > listingURLs
This will dump all of the post URLs into a single file, listingURLs. The next step is to extract the data you want from those posts (price, location, odometer, condition, ...) and put it into a CSV file.
To do this, I used the lynx and html-xml-utils packages.
#######################################
# Download one posting and append a row describing it to ./CSVData.
#
# Row layout: city,price,url,details
#   city    - the part of the URL's host name in front of ".craig"
#             (empty when the URL does not match http(s)://<city>.craig...)
#   price   - lynx text dump of the content selected by `.price`
#   details - lynx text dump of the <b> elements found in the <span>
#             elements of `.attrgroup`; hxselect writes "," after each match
# Fields are written verbatim, without any CSV quoting or escaping.
# Needs wget, hxnormalize and hxselect (html-xml-utils) and lynx.
# Arguments: $1 - absolute URL of the posting
# Outputs:   the URL on stdout; one row appended to ./CSVData;
#            a message on stderr when the download fails
# Returns:   0 on success, 1 if wget failed (no row is written in that case)
#######################################
parsePosting() {
  local url=$1
  local posting page price details city=''
  # Accept both http and https; group 1 is the host part before ".craig".
  local city_re='https?://([^/]*)\.craig'

  # Skip postings that cannot be downloaded instead of recording a row with
  # empty price and details.
  if ! posting=$(wget -q -O- "$url"); then
    printf 'parsePosting: failed to download %s\n' "$url" >&2
    return 1
  fi
  printf '%s\n' "$url"

  # Normalise the page once and reuse it for both extractions. The expansions
  # are quoted so the HTML is passed on byte-for-byte: unquoted, the shell
  # would word-split it and expand any "*" or "?" against the file names in
  # the current directory.
  page=$(printf '%s\n' "$posting" | hxnormalize -x)

  price=$(printf '%s\n' "$page" \
    | hxselect -i -c '.price' \
    | lynx -stdin -dump -width 9000 -nomargins)

  details=$(printf '%s\n' "$page" \
    | hxselect -i -c ".attrgroup" \
    | hxselect -i -s ',' 'span' \
    | hxselect -i -s ',' 'b' \
    | lynx -stdin -dump -width 9000 -nomargins)

  if [[ $url =~ $city_re ]]; then
    city=${BASH_REMATCH[1]}
  fi

  printf '%s,%s,%s,%s\n' "$city" "$price" "$url" "$details" >> CSVData
}
# parsePosting must be exported so the shells started by GNU parallel can
# call it.
export -f parsePosting

# Begin with an empty CSVData; rm's complaint about a file that does not
# exist yet is deliberately discarded.
rm CSVData 2> /dev/null
touch CSVData

# Feed the lines of listingURLs to parallel, which runs parsePosting on them
# with at most 20 jobs at a time.
cat listingURLs \
  | parallel -j 20 --gnu parsePosting
Because this script makes thousands of wget calls, it is a good idea to run it on a machine with a fast internet connection. I use a DigitalOcean VPS, which works well. You can also adjust the -j flag passed to parallel to increase or decrease the number of concurrent jobs.