OpenOffice, PDF, Microsoft Office and generic documents linear search script

Another problem, another script. This time, it’s a script that performs a linear search through various kinds of documents, without having to install something heavyweight like Google Desktop for Linux.

Why did I write this script, you ask? Well, let’s just say it drove me nuts to be staring at a directory full of OpenOffice / PDF / MS Office files, knowing that grep can’t do its thing on them.

Here is the script (please feel free to improve it!):

#!/bin/bash

# Script to look inside each file specified and try and match on the given string

set -u

# At least a search string and one file name are required; otherwise print the
# usage text and stop with a non-zero status.
if (( $# < 2 )); then
	# e.g. finddocs -iE "someText" *
	# Quoted here-doc: the text is printed literally, nothing in it is expanded.
	cat <<'EOF'
Usage:
	finddocs [-(grep_flags)] (SearchStr|RegExp) file1 [file2] [file3]...

Examples:
	find -iname "*some_partial_name*" -print0|xargs -0 finddocs -i "some text"
	find -iname "*.txt" -print0|xargs -0 finddocs -iE "xx[0-5]"
	find -print0|xargs -0 finddocs -i "kerberos"
EOF
	exit 1
fi

rtr=""	# Result slot: the functions of this script hand their result back through it

usrGrepFlags=""
searchStr=""
mimeType=""

## Keep only the grep flags that may be appended to this script's "grep -c" calls
# $1: flag characters, without the leading "-"
# output (stdout): the characters of $1 that are one of i, E, P, v - order preserved
function sanitizeGrepFlags() {
	# Plain parameter expansion: no unquoted sed script that the shell could
	# glob-expand, and no echo of user supplied text.
	printf '%s' "${1//[^iEPv]/}"
}

# Get grep flags (optional first argument starting with "-")
if [[ "$1" == -* ]]; then
	# Strip all non-compatible options - we are appending the options into grep!
	usrGrepFlags="$(sanitizeGrepFlags "${1#-}")"	# use "v" with caution!
	shift
fi

# Get search string
searchStr="$1"
shift


## Escape the XML/HTML special characters of the search string - only applicable to
## documents that store their data as XML (e.g. OpenOffice, MS Office 2007+)
# $1: input string
# return: string - escaped string, handed back through the global $rtr
function escapeHTML() {
	# Ordering matters: "&" is replaced first so that the ampersands introduced
	# by "&lt;" / "&gt;" are not escaped a second time.
	# In a sed replacement a bare "&" means "the matched text", so every literal
	# ampersand on the right-hand side has to be written as "\&".
	rtr=$(printf '%s\n' "$1" | sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g')
}

## Count the lines of a file that match the search string
# $1: filename
# globals read: $usrGrepFlags (extra grep flags), $searchStr (pattern)
# return: int - count of matching lines, handed back through the global $rtr
function findtxt () {
	# "-e" keeps a pattern starting with "-" from being parsed as an option,
	# "--" does the same for the file name.
	rtr=$(grep -c"$usrGrepFlags" -e "$searchStr" -- "$1")
	# For documents with unicode characters: text interleaved with NUL bytes (as
	# UTF-16 encoded text is) cannot match above, so retry with the NULs removed.
	# ${rtr:-0} keeps the test valid when grep printed nothing (e.g. unreadable file).
	if (( ${rtr:-0} == 0 )); then	# This takes longer to search - only do if we can't find anything!
		rtr=$(tr -d '\000' < "$1" | grep -c"$usrGrepFlags" -e "$searchStr")
	fi
}

## Grep for file within a zip container - OpenOffice / MS Office 2007+
# $1: filename
# $2: path of file with content inside the zip file
# globals read: $usrGrepFlags (extra grep flags), $searchStr (pattern)
# return: int - count of matching lines, handed back through the global $rtr
function zipGrep () {
	local escapedSearchStr
	# The content is XML, so the pattern has to be escaped the same way
	escapeHTML "$searchStr"
	escapedSearchStr="$rtr"
	# Try unzipping the document's content file and see if the string exist.
	# unzip's errors are silenced on purpose: when the member cannot be
	# extracted grep simply sees empty input and reports 0.
	# "-e" keeps a pattern starting with "-" from being parsed as a grep option.
	rtr=$(unzip -pa "$1" "$2" 2>/dev/null | grep -c"$usrGrepFlags" -e "$escapedSearchStr")
}

# Search using the rest of the file names: every file whose content matches the
# search string is printed once, on its own line.
for i in "$@"; do
	# Only regular files (or symlinks to them) are inspected
	[ -f "$i" ] || continue

	count=0
	mimeType="$(file -bL0 -- "$i")"	# Note I'm not using the --mime-type / -i - OpenOffice documents shows up as "application/octet-stream"...
	case "$mimeType" in
		"PDF"*)
			# "-e" keeps a pattern starting with "-" from being parsed as a grep option
			count=$(pdftotext -q "$i" - 2>/dev/null | grep -c"$usrGrepFlags" -e "$searchStr")
		;;
		"OpenDocument"*)
			zipGrep "$i" content.xml
			count=$rtr
		;;
		# Everything that is searched as-is: plain text, shell scripts, markup and
		# the older MS Office formats (some MS document gets reported as CDF...)
		"ASCII"* | "Bourne"* | "CDF"* | "Microsoft"* | "UTF-8"* | "XML"* | "HTML"*)
			findtxt "$i"
			count=$rtr
		;;
		# Office 2007 formats
		"Zip"*)
			# Lower-cased extension: a literal dot followed by 4+ letters at the end of the name
			fileext=$(basename -- "$i" | grep -ioE '\.([a-z]{4,})$' | cut -c 2- | tr "[:upper:]" "[:lower:]")
			# If the extension starts with
			case "$fileext" in
				"docx"*)
					zipGrep "$i" "word/document.xml"
					count=$rtr
				;;
				"pptx"*)
					# pptx contents are in /ppt/slides/slide*.xml; stop at the first slide that matches.
					# Member names are read line by line (no word splitting / globbing) on fd 3,
					# so nothing inside the loop can consume the list through stdin.
					while IFS= read -r -u 3 slide; do
						zipGrep "$i" "$slide"
						count=$rtr
						if (( count != 0 )); then break; fi
					done 3< <(zipinfo -1 "$i" | grep "ppt/slides/slide[0-9]*.xml")
				;;
				"xlsx"*)
					# Strings
					zipGrep "$i" "xl/sharedStrings.xml"
					count=$rtr
					# Try looking into the individual worksheets' formula
					if (( count == 0 )); then
						# Formulas; stop at the first worksheet that matches
						while IFS= read -r -u 3 worksheet; do
							zipGrep "$i" "$worksheet"
							count=$rtr
							if (( count != 0 )); then break; fi
						done 3< <(zipinfo -1 "$i" | grep "xl/worksheets/sheet[0-9]*.xml")
					fi
				;;
			esac
		;;
		*)
			# Unknown type: not searched
			continue
		;;
	esac

	# An empty count (a helper printed nothing) evaluates to 0 here instead of
	# being a syntax error.
	if (( count > 0 )); then
		printf '%s\n' "$i"
	fi
done
Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

%d bloggers like this: