#!/usr/bin/awk -f
# R.Muralikrishnan, MPI for Empirical Aesthetics.
# This work is licensed under a Creative Commons
# Attribution-NonCommercial-ShareAlike 4.0 International Licence.
####################################################
# AWK script to work out the stanza-wise positions and
# relative positions of words.
#
# Execute as:
#   awk -f Extract-Stanzawise-Word-Positions.awk Input-Poems-Stanzawise-1.txt > Stanzawise-Word-Positions.txt
#
# Input preparation:
#   1. All instances of ' -' and ' -emdash' were removed.
#   2. Empty lines (containing just \n) have been marked with a '#'.
#      That '#' is used here as the Record Separator, so each whole
#      stanza is read in as a single record.
#   3. The record holding a poem's title and author names starts with
#      the marker 0 as its first field.
#
# Output (tab-separated, one word per line):
#   stanza records: word, position in stanza, position / words in stanza
#   title records : word, 0, 0   (the marker 0 itself is not printed)
#
# Author: R.Muralikrishnan
####################################################

BEGIN {
    OFS = "\t";     # Output columns are separated by tabs.
    RS  = "#";      # One record per stanza (see "Input preparation" above).
}

{
    # Turn every line break inside the stanza into a space. The input file
    # had Windows line ends (CR+LF); a CR left in place would stay glued to
    # the last word of its line, because the default field splitting only
    # treats space, tab and newline as separators. Handling CR+LF, a lone CR
    # and a lone LF alike means the script needs no adaptation for files with
    # UNIX line ends. Replacing the line break with a space also keeps the
    # last word of line x and the first word of line x+1 as separate words.
    # Modifying $0 makes awk re-split the record, so NF and $i below refer
    # to the cleaned-up stanza.
    gsub(/\r\n|\r|\n/, " ", $0);

    # $1 is 0 only on the record with the poem's title; decide this once per
    # record instead of once per word.
    is_stanza = ($1 != 0);

    for (i = 1; i <= NF; i++)
    {
        if (is_stanza)
        {
            # The word, its position in the stanza, and its relative
            # position in the stanza.
            print $i, i, i / NF;
        }
        else if ($i != 0)
        {
            # Title and author names are printed with 0 as the position
            # information; the 'title marker 0' itself is skipped.
            # NOTE(review): this skips every field that compares equal to 0,
            # not only the leading marker -- confirm that titles never
            # contain such a token.
            print $i, 0, 0;
        }
    }
}