#-------------------------------------------------------------------
#  ParseJsComment.py
#
#  The ParseJsComment module.
#
#  Copyright 2016 Applied Invention, LLC
#-------------------------------------------------------------------

'''Find comments in a file.
'''

#-------------------------------------------------------------------
# Import statements go here.
#

#
# Import statements go above this line.
#-------------------------------------------------------------------


#-------------------------------------------------------------------
def findComments(text):
  '''Finds all comments in the text.

  The text is scanned once, character by character, tracking whether
  the scanner is in ordinary code, a quoted string, a multi-line
  comment or a single-line comment.  Comment markers that appear inside
  strings (or inside what findRegexEnd() considers a regex literal)
  are not reported.

  @param text The text to search for comments in.

  @return a list of (beginIndex, endIndex) tuples.  beginIndex is the
          index of the first character of the comment and endIndex is
          one past its last character.  A single-line comment includes
          its terminating newline, if there is one.

  @throws ValueError if the text ends inside a string or inside a
          multi-line comment.
  '''

  # pylint: disable = R0912, R0915, C0103

  # Possible states.

  # Other code.
  NORMAL = 0

  # Just saw an escape character (outside of a string or comment).
  ESCAPED = 1

  # In a string.
  STRING = 2

  # In a multi-line comment.
  MULTI_COMMENT = 3

  # In a single-line comment.
  SINGLE_COMMENT = 4


  # List of (beginIndex, endIndex) tuples.
  comments = []

  # The current state.
  state = NORMAL

  # The start index of the comment we're in, or -1 if not in a comment.
  startIndex = -1

  # The index of the quote that opened the string we're in, or -1 if
  # we're not in a string.  Only used for error reporting.
  stringStartIndex = -1

  # The character that started the string we're in, or None if we're not
  # in a string.
  stringStartChar = None

  textLength = len(text)

  index = 0
  while index < textLength:
    ch = text[index]
    nextCh = None
    if index + 1 < textLength:
      nextCh = text[index + 1]

    startingEscape = False

    # A slash that starts an escape.
    if state == NORMAL and ch == '\\':
      startingEscape = True

    # An escape sequence inside a string.  The escaped character is
    # skipped whatever it is: skipping only escaped quotes would make
    # an escaped backslash right before the closing quote ("a\\")
    # look like an escaped quote, and the string would never end.
    elif state == STRING and ch == '\\':
      index += 1

    # End of a string.
    elif state == STRING and ch == stringStartChar:
      stringStartChar = None
      stringStartIndex = -1
      state = NORMAL

    # Beginning of a string.
    elif state == NORMAL and ch in ('"', "'"):
      state = STRING
      stringStartChar = ch
      stringStartIndex = index

    # End of a multi-line comment.
    elif state == MULTI_COMMENT and ch == '*' and nextCh == '/':
      state = NORMAL

      # Step over the '*' so that the closing '/' is consumed too.
      index += 1

      comments.append((startIndex, index + 1))
      startIndex = -1

    # Beginning of a multi-line comment.
    elif state == NORMAL and ch == '/' and nextCh == "*":
      state = MULTI_COMMENT
      startIndex = index

      # Skip the '*' so that "/*/" is not seen as a complete comment.
      index += 1

    # End of a single-line comment.
    elif state == SINGLE_COMMENT and ch == '\n':
      state = NORMAL

      comments.append((startIndex, index + 1))
      startIndex = -1

    # Beginning of a single-line comment.
    elif state == NORMAL and ch == '/' and nextCh == '/':
      state = SINGLE_COMMENT
      startIndex = index
      index += 1

    # Beginning of a regex or a division operator.  The '//' and '/*'
    # cases were already handled above, so any other slash lands here.
    elif state == NORMAL and ch == '/':
      endIndex = findRegexEnd(text, index)
      if endIndex > -1:
        # It's a regex, not a division operation, so skip to the end.
        index = endIndex


    # An escape outside of a string lasts for exactly one character:
    # the character after the backslash is seen in the ESCAPED state,
    # which none of the branches above match.
    if startingEscape:
      state = ESCAPED
    elif state == ESCAPED:
      state = NORMAL

    index += 1

  # A single-line comment that ended without a newline.
  if state == SINGLE_COMMENT:
    comments.append((startIndex, index))

  if state == STRING:
    msg = "Invalid Javascript:  "
    msg += "String started at %s never finished." % (stringStartIndex,)
    raise ValueError(msg)

  if state == MULTI_COMMENT:
    msg = "Invalid Javascript:  "
    msg += "Comment started at %s never finished." % (startIndex,)
    raise ValueError(msg)

  return comments

#-------------------------------------------------------------------
def findRegexEnd(text, startIndex):
  '''Locate the slash that closes a regex literal.

  @param text The JS text.
  @param startIndex The index of the slash that may open a regex.

  @return The index of the closing slash, or -1 if no closing slash
          is found on the same line (the slash is then assumed not
          to start a regex).
  '''

  # Telling a regex literal from a division operator properly needs a
  # full JavaScript parse.  Instead, the slash is taken to open a regex
  # whenever an unescaped slash outside a character class follows it on
  # the same line.  Some arithmetic expressions will be mistaken for
  # regexes, which is acceptable since the goal is only to find comments.

  assert text[startIndex] == '/'

  # The search never goes past the end of the current line.
  lineEnd = text.find('\n', startIndex + 1)
  if lineEnd < 0:
    lineEnd = len(text)

  # True while between '[' and ']', where a slash doesn't end the regex.
  inCharClass = False

  # True when the previous character was a backslash, so the current
  # character is taken literally.
  skipNext = False

  for pos in range(startIndex + 1, lineEnd):
    if skipNext:
      skipNext = False
      continue

    current = text[pos]

    if current == '\\':
      # Escapes work the same inside and outside a character class.
      skipNext = True
    elif inCharClass:
      if current == ']':
        inCharClass = False
    elif current == '/':
      # The closing slash.
      return pos
    elif current == '[':
      inCharClass = True

  return -1
