# Creative Commons License R.Muralikrishnan, MPI for Empirical Aesthetics. This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Licence.
#!/usr/bin/awk -f
####################################################O
# AWK Script to work out stanza-wise positions and
#            relative positions of words
#
# Execute as: awk -f Extract-Stanzawise-Word-Positions.awk Input-Poems-Stanzawise-1.txt > Stanzawise-Word-Positions.txt
# In the input file: 1. All instances of ' -' and ' -emdash' were removed.
#                    2. Empty lines (containing just \n) have been marked with a #
#                       This will be used here as the Record Separator, so the whole stanza will be read in as a line.
#
# Author: R.Muralikrishnan
####################################################m

# Output columns are tab-separated. The input marks every blank line with '#',
# so using '#' as the record separator makes awk read a whole stanza (or the
# poem's title line) as a single record.
BEGIN { OFS = "\t"; RS = "#" }

# For every record, print one line per word:
#   stanza record: word, its position in the stanza (1-based), and its
#                  relative position (position / number of words in the stanza)
#   title record:  word, 0, 0
# A title record is recognised by its first field being the marker 0; the
# marker itself is not printed.
# Limitation: a stanza whose first word compares equal to 0 cannot be told
# apart from a title record.
{
  # Replace every carriage return (together with a directly following line
  # feed, if any) with a space. Because RS is '#', a record spans several
  # input lines; without this, the CR of a Windows line end would stay glued
  # to the last word of a line. Plain line feeds need no treatment: with the
  # default FS, awk already splits fields on newlines as well as on blanks,
  # so LF-only (UNIX) input works unchanged. Assigning to $0 makes awk
  # re-split the record into fields.
  gsub(/\r\n?/, " ", $0)

  # Decide once per record instead of once per word.
  is_title = ($1 == 0)

  for (i = 1; i <= NF; i++)
  {
    if (!is_title)
    {
      print $i, i, i/NF   # word, its position, its relative position
    }
    else if (i > 1)
    {
      # Skip only the marker in field 1. Testing the field's value instead
      # would also drop any word of the title or author name that happens to
      # be numerically zero (e.g. "0").
      print $i, 0, 0
    }
  }
}