#!/usr/bin/perl
#
# Print the plain text of a .docx file to standard output.
#
# Usage: perl <this script> file.docx
#
# The script pipes word/document.xml out of the archive with the external
# `unzip` program, turns paragraph tags into blank lines, strips every other
# tag, decodes the predefined XML entities, and removes non-printing
# characters.  Input is decoded as UTF-8 and output is written as UTF-8.

use strict;
use warnings;

# The five entities predefined by XML, mapped to the characters they stand for.
my %XML_ENTITY = (
    lt   => '<',
    gt   => '>',
    quot => '"',
    apos => "'",
    amp  => '&',
);

# ooxml_to_text($xml)
#   $xml    - one chunk (line) of OOXML markup, already decoded to characters.
#   Returns the chunk reduced to plain text.
sub ooxml_to_text {
    my ($xml) = @_;

    # Convert paragraph start tags to blank lines.  The tag name must be
    # followed by whitespace or '>' so that <w:pPr>, <w:pStyle> and similar
    # tags are not mistaken for paragraphs.
    $xml =~ s/<w:p(?:\s[^>]*)?>/\n\n/g;

    # Substitute all other OOXML tags with nothing.
    $xml =~ s/<[^>]+>//g;

    # Decode entities in a single pass so that "&amp;lt;" becomes "&lt;"
    # rather than being decoded twice into "<".
    $xml =~ s/&(lt|gt|quot|apos|amp);/$XML_ENTITY{$1}/g;

    # Remove everything that is neither printable nor whitespace.
    $xml =~ s/[^[:print:][:space:]]+//g;

    # Chop extra newlines out: never leave more than one blank line in a row.
    $xml =~ s/\n{3,}/\n\n/g;

    return $xml;
}

my $docxfile = $ARGV[0];
die "Usage: $0 file.docx\n"
    unless defined $docxfile && length $docxfile;

# List-form pipe open runs unzip directly, without a shell, so quotes or other
# shell metacharacters in the file name cannot be interpreted as commands.
open(my $document, '-|', 'unzip', '-p', $docxfile, 'word/document.xml')
    or die "Cannot run unzip on '$docxfile': $!\n";

# Decode the XML as UTF-8 so the non-printing filter works on characters
# instead of deleting the individual bytes of every non-ASCII character.
binmode($document, ':encoding(UTF-8)');
binmode(STDOUT,    ':encoding(UTF-8)');

while (my $line = <$document>) {
    print ooxml_to_text($line);
}
print "\n";

# close() on a pipe returns false when the child exited with a non-zero
# status (with $! unset in that case), so report which failure occurred.
close($document)
    or die(
        $!
        ? "Error closing unzip pipe: $!\n"
        : "unzip exited with status " . ( $? >> 8 ) . " for '$docxfile'\n"
    );