#!/bin/bash

# Link checker. Edit to suit your needs

# Optional proxies. Without these curl will use environment vars.
#export https_proxy="http://proxy.example.org:8080/"

BASE=""                                 # Base URL
DIR="/var/local/lib/websearch/"         # Directory
ERR=0                                   # Error count
HDR=""                                  # Response header
HRC=0                                   # Response code
INFIL="index/ext-links.report"          # File to be read
LC=0                                    # Number of lines in response header
LOC=""                                  # Location: in response header
MINLINS=0                               # Minimum number of non-blank lines in header
OUTHTM="chk-rem-lnk.html"               # Generated report
RTCNT=0                                 # Retry count
REDIR=""                                # Redirect URL
REPTIT="Broken remote links"            # Title of report
TMP="/tmp/gen-url-chk-$(date +%s)-${$}" # Temp file
COOKFIL="${TMP}-cookies"                # Cookie file
URL=""                                  # URL to check
#USERAGENT="Mozilla/5.0 (X11; Linux i686; rv:38.0) Gecko/20100101 Firefox/38.0 Iceweasel/38.4.0"
USERAGENT="Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0"
#USERAGENT="Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0"
WEBPATH=""                              # Path to file

gethrc()
{
    # Get HTTP response code.
    # Behind a Squid proxy an https fetch is prefixed with an extra
    # status line and a blank line, e.g.:
    #   HTTP/1.1 200 Connection established
    #
    #   HTTP/1.1 404 Not Found
    # so the real status line is the third line of the header.
    if ( echo "${HDR}" | head -n1 | egrep -q "HTTP/[1-9]([.][0-9])? 200 Connection established" )
    then
        # Squid https connection header
        HRC=$( echo "${HDR}" | head -n3 | tail -n1 | awk '{print $2}' )
    else
        HRC=$( echo "${HDR}" | head -n1 | awk '{print $2}' )
    fi
    if ! ( echo $HRC | egrep -q '^[0-9]{3}$' )
    then
        # No usable status code; fake a server error
        HRC=500
    fi
}

getloc()
{
    # Get Location
    if [[ $HRC -eq 301 || $HRC -eq 302 || $HRC -eq 303 || \
          $HRC -eq 307 || $HRC -eq 308 ]]
    then
        # Moved
        LOC=$( echo "${HDR}" | egrep -i '^Location: ' | awk '{print $2}' )
    else
        # Not moved
        LOC=""
    fi
}

chkget()
{
    # For webservers that don't support HEAD.
    # Fetches the page, separates the HTTP header from the body and
    # finds the HTTP response code.
    #echo "GET $1"
    # -m total timeout
    # -b Cookie file
    # -c Cookie file
    # -A user agent
    # -H header
    # -k allow self signed cert
    # -s silent
    # -D dump header
    # -  to stdout
    MINLINS=0
    timeout 24 curl --connect-timeout 8 -m 20 -b "${COOKFIL}" -c "${COOKFIL}" \
        -A "${USERAGENT}" -H "Accept: */*" -k -s -D - "$1" > "${TMP}" 2>&1
    if ( head -n1 "${TMP}" | egrep -q "HTTP/[1-9]([.][0-9])? 200 Connection established" )
    then
        # Squid https connection header precedes the real one, so the
        # first blank line does not yet end the header
        MINLINS=3
    fi
    # Set the field separator to newline or return
    IFS=$'\n\r'
    # Count header lines: the header ends at the first blank line past
    # any proxy preamble
    LC=0
    while read LINE
    do
        if [ -z "${LINE}" ] && (( $LC >= $MINLINS ))
        then
            break
        fi
        let LC+=1
    done < "${TMP}"
    # Separate header from page
    HDR=$( sed -e 's/\r//g' "${TMP}" | head -n ${LC} )
    # HTTP response code
    gethrc
    # Did it move?
    getloc
}

chkurl()
{
    # Fetches the HTTP header with a HEAD request and finds the HTTP
    # response code
    #echo "HEAD $1"
    # -m total timeout
    # -A user agent
    # -H header
    # -k allow self signed cert
    # -s silent
    # -I HEAD
    HDR=$( timeout 12 curl --connect-timeout 4 -m 10 -A "${USERAGENT}" \
        -H "Accept: */*" -k -s -I "$1" 2>&1 | sed -e 's/\r//g' )
    # HTTP response code
    gethrc
    # Some servers respond oddly to a HEAD
    if [[ $HRC -ne 200 && $HRC -ne 301 && $HRC -ne 302 && \
          $HRC -ne 303 && $HRC -ne 307 && $HRC -ne 308 ]]
    then
        # Check again using GET instead of HEAD
        chkget "${1}"
    else
        # Did it move?
        getloc
    fi
}

fndbase()
{
    # Finds 'method://host' in method://host/dir/file
    BASE=$( echo "$1" | awk -F\/ '{print $1 "//" $3}' )
    # Finds 'method://host/dir/' in method://host/dir/file
    # (findwebpath is a site-local helper; see the note at the bottom)
    WEBPATH=$( echo "$1" | /usr/local/bin/findwebpath )
}

#
# Main
#
cd "${DIR}" || exit 1
rm -f "${OUTHTM}.bak"
mv "${OUTHTM}" "${OUTHTM}.bak"
> "${OUTHTM}"

#
# Start HTML report page
#
" > "${OUTHTM}"
while read URL
do
    # Stuff to skip
    if ! ( echo "${URL}" | egrep -q '^http(s)?:' )
    then
        # Not an http(s) URL
        continue
    fi
    if ( echo "${URL}" | egrep -q 'http://validator.w3.org/check/referer' )
    then
        # The W3C validator link validates whatever page refers to it,
        # so fetching it directly tells us nothing
        continue
    fi
    RTCNT=0
    REDIR=""
    fndbase "${URL}"
    chkurl "${URL}"
    # Check for redirects
    while [ ! -z "${LOC}" ]
    do
        # Moved
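        # (Illustrative) how a Location: value is resolved, assuming
        # the current URL is http://host/dir/file:
        #   Location: http://elsewhere/x -> used as-is (absolute)
        #   Location: /x                 -> http://host/x (BASE)
        #   Location: x                  -> http://host/dir/x (WEBPATH)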
if ! ( echo "${LOC}" | egrep -q '^(f|ht)tp(s)?://' )
then
# Relative URL
if ( echo "${LOC}" | egrep -q '^/' )
then
LOC="${BASE}${LOC}"
else
LOC="${WEBPATH}${LOC}"
fi
else
# Absolute URL
fndbase "${LOC}"
fi
REDIR="${LOC}"
# Now check again
if [[ RTCNT -le 3 ]]
then
chkurl "${LOC}"
else
# Too many errors; use GET
chkget "${LOC}"
fi
let RTCNT+=1
#echo "$RTCNT Loc: ${LOC}"
        if [ $RTCNT -gt 8 ]
        then
            # Too many retries
            echo "Too many retries: ${RTCNT} for ${URL}"
            HRC=500
            break
        fi
    done
    # Check for cookie frenzy: some servers redirect back to the
    # requested URL once a cookie has been set. If we ended up where we
    # started with a 200, it did not really move.
    if [[ $HRC -eq 200 ]] && [ "${REDIR}" == "${URL}" ]
    then
        REDIR=""
    fi
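    # (Illustrative) such an exchange looks like:
    #   HEAD /page -> 302, Location: /page, Set-Cookie: ...
    #   GET  /page -> 200 once the cookie from ${COOKFIL} is presented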
    # Check result
    if [[ $HRC -ne 200 || ! -z "${REDIR}" ]]
    then
        # Error
        let ERR+=1
        printf "%3d " "${ERR}" >> "${OUTHTM}"
        echo -n "${HRC} ${URL}" >> "${OUTHTM}"
        if [ ! -z "${REDIR}" ]
        then
            echo -n " Location: ${REDIR}" >> "${OUTHTM}"
        fi
        if ( echo $HRC | egrep -q '5[0-9][0-9]' )
        then
            # Server error: also check whether the hostname still
            # resolves
            HOSTNAME=$( echo "${URL}" | awk -F\/ '{print $3}' )
            echo "" >> "${OUTHTM}"
            host "${HOSTNAME}" 2>&1 | sed -e 's/^/\t/g' >> "${OUTHTM}"
        fi
        echo "" >> "${OUTHTM}"
    fi
    # Be polite to the remote servers
    sleep 1
done < "${INFIL}"
echo "
Errors: ${ERR}
" >> "${OUTHTM}" # Report to stdout lynx -dont_wrap_pre -dump -force_html -nolist "file://${DIR}/${OUTHTM}" # Remove tmp files if [ -f ${TMP} ] then rm ${TMP} fi if [ -f ${COOKFIL} ] then rm ${COOKFIL} fi