#!/bin/bash
# Link checker. Edit to suit your needs
#
# Reads URLs (one per line) from ${INFIL}, probes each with curl
# (HEAD first, falling back to GET), follows redirects manually,
# and writes an HTML report of broken/redirected links to ${OUTHTM}.

# Optional proxies. Without these curl will use environment vars.
#export https_proxy="http://proxy.example.org:8080/"

BASE=""                                 # Base URL (method://host)
DIR="/var/local/lib/websearch/"         # Working directory
ERR=0                                   # Error count
HDR=""                                  # Response header
HRC=0                                   # Response code
INFIL="index/ext-links.report"          # File to be read
LC=0                                    # Number of lines in response header
LOC=""                                  # Location: in response header
MINLINS=0                               # Minimum number of non-blank lines in header
OUTHTM="chk-rem-lnk.html"               # Generated report
RTCNT=0                                 # Retry count
REDIR=""                                # Redirect URL
REPTIT="Broken remote links"            # Title of report
# Unpredictable temp name (was: /tmp/gen-url-chk-$(date +%s)-$$, guessable)
TMP=$( mktemp "/tmp/gen-url-chk-XXXXXX" ) || exit 1
COOKFIL="${TMP}-cookies"                # Cookie file
URL=""                                  # URL to check
#USERAGENT="Mozilla/5.0 (X11; Linux i686; rv:38.0) Gecko/20100101 Firefox/38.0 Iceweasel/38.4.0"
USERAGENT="Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0"
#USERAGENT="Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0"
WEBPATH=""                              # Path to file (method://host/dir/)

# Remove temp files on any exit path, not just the happy one
trap 'rm -f "${TMP}" "${COOKFIL}"' EXIT

gethrc()
{ # Extract the HTTP response code from ${HDR} into ${HRC}
	if echo "${HDR}" | head -n1 | grep -Eq "HTTP/[1-9]([.][0-9])? 200 Connection established"
	then
		# Squid https CONNECT preamble: the real status line is line 3
		HRC=$( echo "${HDR}" | head -n3 | tail -n1 | awk '{print $2}' )
	else
		HRC=$( echo "${HDR}" | head -n1 | awk '{print $2}' )
	fi
	if ! echo "${HRC}" | grep -Eq '^[0-9]{3}$'
	then
		# Not a 3-digit code; fake a server error
		HRC=500
	fi
}

getloc()
{ # Set ${LOC} from the Location: header when ${HRC} is a redirect
	if [[ ${HRC} -eq 301 || ${HRC} -eq 302 || ${HRC} -eq 303 || \
	      ${HRC} -eq 307 || ${HRC} -eq 308 ]]
	then
		# Moved
		LOC=$( echo "${HDR}" | grep -Ei '^Location: ' | awk '{print $2}' )
	else
		# Not moved
		LOC=""
	fi
}

chkget()
{ # For webservers that don't support HEAD
	# Fetches page, separates HTTP header and finds HTTP response code
	#echo "GET $1"
	# -m total timeout
	# -b Cookie file
	# -c Cookie file
	# -A user agent
	# -H header
	# -k allow self signed cert
	# -s silent
	# -D dump header
	# -  to stdout
	MINLINS=0
	timeout 24 curl --connect-timeout 8 -m 20 -b "${COOKFIL}" -c "${COOKFIL}" \
		-A "${USERAGENT}" -H "Accept: */*" -k -s -D - "$1" > "${TMP}" 2>&1
	# NOTE(review): this inspects the *previous* response still in ${HDR},
	# not the page just fetched into ${TMP} — presumably the earlier HEAD
	# shows the same squid CONNECT preamble; confirm before changing.
	if echo "${HDR}" | head -n1 | grep -Eq "HTTP/[1-9]([.][0-9])? 200 Connection established"
	then
		# Squid https connection header: skip past it when locating
		# the blank line that ends the header block
		MINLINS=3
	fi
	# Set the field separator to newline or return
	IFS=$'\n\r'
	# Count header lines up to the first blank separator line
	LC=0
	while read -r LINE
	do
		if [ -z "${LINE}" ] && (( LC >= MINLINS ))
		then
			break
		fi
		let LC+=1
	done < "${TMP}"
	# Separate header from page
	HDR=$( sed -e 's/\r//g' "${TMP}" | head -n "${LC}" )
	# HTTP response code
	gethrc
	# Did it move?
	getloc
}

chkurl()
{ # Fetches HTTP header via HEAD and finds HTTP response code
	#echo "HEAD $1"
	# -m total timeout
	# -A user agent
	# -H header
	# -k allow self signed cert
	# -s silent
	# -I HEAD
	HDR=$( timeout 12 curl --connect-timeout 4 -m 10 -A "${USERAGENT}" \
		-H "Accept: */*" -k -s -I "$1" 2>&1 | sed -e 's/\r//g' )
	# HTTP response code
	gethrc
	# Some servers respond weird to a HEAD
	if [[ ${HRC} -ne 200 && ${HRC} -ne 301 && ${HRC} -ne 302 && \
	      ${HRC} -ne 303 && ${HRC} -ne 307 && ${HRC} -ne 308 ]]
	then
		# Check using GET instead of HEAD
		chkget "$1"
	else
		# Did it move?
		getloc
	fi
}

fndbase()
{ # Finds 'method://host' in method://host/dir/file
	BASE=$( echo "$1" | awk -F/ '{print $1 "//" $3}' )
	# Finds 'method://host/dir/' in method://host/dir/file
	WEBPATH=$( echo "$1" | /usr/local/bin/findwebpath )
}

#
# Main
#
# Abort rather than scribble report files in the wrong directory
cd "${DIR}" || exit 1
# Rotate previous report; -f and the guard keep the first run quiet
rm -f "${OUTHTM}.bak"
[ -f "${OUTHTM}" ] && mv "${OUTHTM}" "${OUTHTM}.bak"
> "${OUTHTM}"
#
# Start HTML report page
#
echo "
" > "${OUTHTM}"
while read -r URL
do
	# Stuff to skip
	if ! echo "${URL}" | grep -Eq '^http(s)?:'
	then
		# Not a http(s) url
		continue
	fi
	if echo "${URL}" | grep -q 'http://validator.w3.org/check/referer'
	then
		continue
	fi
	RTCNT=0
	REDIR=""
	fndbase "${URL}"
	chkurl "${URL}"
	# Check for redirects
	while [ -n "${LOC}" ]
	do
		# Moved
		if ! echo "${LOC}" | grep -Eq '^(f|ht)tp(s)?://'
		then
			# Relative URL: resolve against base or web path
			if echo "${LOC}" | grep -q '^/'
			then
				LOC="${BASE}${LOC}"
			else
				LOC="${WEBPATH}${LOC}"
			fi
		else
			# Absolute URL
			fndbase "${LOC}"
		fi
		REDIR="${LOC}"
		# Now check again
		if [[ ${RTCNT} -le 3 ]]
		then
			chkurl "${LOC}"
		else
			# Too many errors; use GET
			chkget "${LOC}"
		fi
		let RTCNT+=1
		#echo "$RTCNT Loc: ${LOC}"
		if [ ${RTCNT} -gt 8 ]
		then
			# Too many retries
			echo "Too many retries: ${RTCNT} for ${URL}"
			HRC=500
			break
		fi
	done
	# Check for cookie frenzy: a 200 that "redirected" back to itself
	if [[ ${HRC} -eq 200 ]] && [ "${REDIR}" == "${URL}" ]
	then
		REDIR=""
	fi
	# Check result
	if [[ ${HRC} -ne 200 || -n "${REDIR}" ]]
	then
		# Error
		let ERR+=1
		printf "%3d " "${ERR}" >> "${OUTHTM}"
		echo -n "${HRC} ${URL}" >> "${OUTHTM}"
		if [ -n "${REDIR}" ]
		then
			echo -n " Location: ${REDIR}" >> \
				"${OUTHTM}"
		fi
		if echo "${HRC}" | grep -Eq '5[0-9][0-9]'
		then
			# Server error: include DNS lookup of hostname
			HOSTNAME=$( echo "${URL}" | awk -F/ '{print $3}' )
			echo "" >> "${OUTHTM}"
			host "${HOSTNAME}" 2>&1 | sed -e 's/^/\t/g' >> "${OUTHTM}"
		fi
		echo "" >> "${OUTHTM}"
	fi
	# Be polite to remote servers
	sleep 1
done < "${INFIL}"
echo "
Errors: ${ERR}
" >> "${OUTHTM}"
# Report to stdout
lynx -dont_wrap_pre -dump -force_html -nolist "file://${DIR}/${OUTHTM}"
# Temp files are removed by the EXIT trap