#!/bin/bash
# Link checker. Edit to suit your needs.

# Optional proxies. Without these curl will use environment vars.
#export https_proxy="http://proxy.example.org:8080/"

BASE=""                                  # Base URL
DIR="/var/local/lib/websearch/"          # Directory
ERR=0                                    # Error count
HDR=""                                   # Response header
HRC=0                                    # Response code
INFIL="index/ext-links.report"           # File to be read
LC=0                                     # Number of lines in response header
LOC=""                                   # Location: in response header
MINLINS=0                                # Minimum number of non-blank lines in header
OUTHTM="chk-rem-lnk.html"                # Generated report
RTCNT=0                                  # Retry count
REDIR=""                                 # Redirect URL
REPTIT="Broken remote links"             # Title of report
TMP="/tmp/gen-url-chk-$(date +%s)-${$}"  # Temp file
COOKFIL="${TMP}-cookies"                 # Cookie file
URL=""                                   # URL to check
#USERAGENT="Mozilla/5.0 (X11; Linux i686; rv:38.0) Gecko/20100101 Firefox/38.0 Iceweasel/38.4.0"
USERAGENT="Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0"
#USERAGENT="Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0"
WEBPATH=""                               # Path to file

gethrc() {
	# Get HTTP response code from the first line of the header
	if ( echo "${HDR}" | head -n1 | egrep -q "HTTP/[1-9]([.][0-9])? 200 Connection established" )
	then
		# Squid https connection header; real status is on line 3
		HRC=$( echo "${HDR}" | head -n3 | tail -n1 | awk '{print $2}' )
	else
		HRC=$( echo "${HDR}" | head -n1 | awk '{print $2}' )
	fi
	if ! ( echo $HRC | egrep -q '^[0-9]{3}$' )
	then
		# Fake server error
		HRC=500
	fi
}

getloc() {
	# Get Location: from the header on redirect codes
	if [[ $HRC -eq 301 || $HRC -eq 302 || $HRC -eq 303 || \
	      $HRC -eq 307 || $HRC -eq 308 ]]
	then
		# Moved
		LOC=$( echo "${HDR}" | egrep -i '^Location: ' | awk '{print $2}' )
	else
		# Not moved
		LOC=""
	fi
}

chkget() {
	# For webservers that don't support HEAD.
	# Fetches page, separates HTTP header and finds HTTP response code.
	#echo "GET $1"
	# -m  total timeout
	# -b  cookie file (read)
	# -c  cookie file (write)
	# -A  user agent
	# -H  header
	# -k  allow self signed cert
	# -s  silent
	# -D  dump header
	# -   to stdout
	MINLINS=0
	timeout 24 curl --connect-timeout 8 -m 20 -b "${COOKFIL}" -c "${COOKFIL}" \
	    -A "${USERAGENT}" -H "Accept: */*" -k -s -D - "$1" > "${TMP}" 2>&1
	if ( head -n1 "${TMP}" | egrep -q "HTTP/[1-9]([.][0-9])? 200 Connection established" )
	then
		# Squid https connection header precedes the real one
		MINLINS=3
	fi
	# Set the field separator to newline or return
	IFS=$'\n\r'
	# Count header lines up to the first blank line past MINLINS
	LC=0
	while read -r LINE
	do
		if [ -z "${LINE}" ] && (( $LC >= $MINLINS ))
		then
			break
		fi
		let LC+=1
	done < "${TMP}"
	# Separate header from page
	HDR=$( /bin/cat "${TMP}" | sed -e 's/\r//g' | head -n ${LC} )
	# HTTP response code
	gethrc
	# Did it move?
	getloc
}

chkurl() {
	# Fetches HTTP header and finds HTTP response code
	#echo "HEAD $1"
	# -m  total timeout
	# -A  user agent
	# -H  header
	# -k  allow self signed cert
	# -s  silent
	# -I  HEAD
	HDR=$( timeout 12 curl --connect-timeout 4 -m 10 -A "${USERAGENT}" \
	    -H "Accept: */*" -k -s -I "$1" 2>&1 | sed -e 's/\r//g' )
	# HTTP response code
	gethrc
	# Some servers respond weird to a HEAD
	if [[ $HRC -ne 200 && $HRC -ne 301 && $HRC -ne 302 && \
	      $HRC -ne 303 && $HRC -ne 307 && $HRC -ne 308 ]]
	then
		# Check using GET instead of HEAD
		chkget "${1}"
	else
		# Did it move?
		getloc
	fi
}

fndbase() {
	# Finds 'method://host' in method://host/dir/file
	BASE=$( echo "$1" | awk -F\/ '{print $1 "//" $3}' )
	# Finds 'method://host/dir/' in method://host/dir/file (local helper)
	WEBPATH=$( echo "$1" | /usr/local/bin/findwebpath )
}

#
# Main
#
cd "${DIR}" || exit 1
rm -f "${OUTHTM}.bak"
mv "${OUTHTM}" "${OUTHTM}.bak"
> "${OUTHTM}"

#
# Start HTML report page
#
echo "<html>
<head>
<title>Failed ${REPTIT}</title>
</head>
<body>
<h1>Failed ${REPTIT}</h1>
<pre>
" > "${OUTHTM}"

while read -r URL
do
	#  Stuff to skip
	if ! ( echo "${URL}" | egrep -q '^http(s)?:' )
	then
		# Not a http(s) url
		continue
	fi
	if ( echo "${URL}" | egrep -q 'http://validator.w3.org/check/referer' )
	then
		continue
	fi
	
	RTCNT=0
	REDIR=""
	fndbase "${URL}"
	chkurl "${URL}"
	# Check for redirects
	while [ ! -z "${LOC}" ]
	do
		# Moved
		if ! ( echo "${LOC}" | egrep -q '^(f|ht)tp(s)?://' )
		then
			# Relative URL
			if ( echo "${LOC}" | egrep -q '^/' )
			then
				LOC="${BASE}${LOC}"
			else
				LOC="${WEBPATH}${LOC}"
			fi
		else
			# Absolute URL
			fndbase "${LOC}"
		fi
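		# Example (hypothetical hosts): with BASE=https://example.org and
		# WEBPATH=https://example.org/dir/,
		#   Location: /new       ->  https://example.org/new
		#   Location: page.html  ->  https://example.org/dir/page.html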
		REDIR="${LOC}"
		# Now check again
		if [[ $RTCNT -le 3 ]]
		then
			chkurl "${LOC}"
		else
			# Still redirecting after several HEAD attempts; use GET
			chkget "${LOC}"
		fi
		let RTCNT+=1
		#echo "$RTCNT Loc: ${LOC}"
		if [ $RTCNT -gt 8 ]
		then
			# Too many retries
			echo "Too many retries: ${RTCNT} for ${URL}"
			HRC=500
			break
		fi
	done
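	# Loop ends when LOC is empty (no more redirects) or retries run out;
	# REDIR still holds the last redirect target, if any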
	# Cookie frenzy: some servers bounce back to the same URL while
	# setting cookies; a final 200 at the original URL is not a redirect
	if [[ $HRC -eq 200 && "${REDIR}" == "${URL}" ]]
	then
		REDIR=""
	fi
	# Check result
	if [[ $HRC -ne 200 || ! -z "${REDIR}" ]]
	then
		# Error
		let ERR+=1
		printf "%3d " "${ERR}" >> "${OUTHTM}"
		echo -n "${HRC} ${URL}" >> "${OUTHTM}"
		if [ ! -z "${REDIR}" ]
		then
			echo -n " Location: ${REDIR}" >> \
			    "${OUTHTM}"
		fi
		if ( echo $HRC | egrep -q '^5[0-9][0-9]$' )
		then
			# Check hostname
			HOSTNAME=$( echo "${URL}" | awk -F\/ '{print $3}' )
			echo "" >> "${OUTHTM}"
			host "${HOSTNAME}" 2>&1 | sed -e 's/^/\t/g' >> "${OUTHTM}"
		fi
		echo "" >> "${OUTHTM}"
	fi
	# Be polite to the remote servers
	sleep 1
done < "${INFIL}"

echo "

Errors: ${ERR}

" >> "${OUTHTM}" # Report to stdout lynx -dont_wrap_pre -dump -force_html -nolist "file://${DIR}/${OUTHTM}" # Remove tmp files if [ -f ${TMP} ] then rm ${TMP} fi if [ -f ${COOKFIL} ] then rm ${COOKFIL} fi