Lexer.java

  1. /* Copyright 2002-2025 CS GROUP
  2.  * Licensed to CS GROUP (CS) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * CS licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *   http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.orekit.utils.units;

  18. import org.hipparchus.fraction.Fraction;
  19. import org.orekit.errors.OrekitException;
  20. import org.orekit.errors.OrekitMessages;

  21. /** Lexer for units.
  22.  * @author Luc Maisonobe
  23.  * @since 11.0
  24.  */
  25. class Lexer {

  26.     /** Unit specification to tokenize. */
  27.     private final CharSequence unitSpecification;

  28.     /** End index. */
  29.     private final int end;

  30.     /** Start index for next token. */
  31.     private int start;

  32.     /** Next to last token emitted. */
  33.     private Token nextToLast;

  34.     /** Last token emitted. */
  35.     private Token last;

  36.     /** Upcoming token (which was pushed back). */
  37.     private Token upcoming;

  38.     /** Build a lexer for a unit specification.
  39.      * @param unitSpecification unit specification to tokenize
  40.      */
  41.     Lexer(final CharSequence unitSpecification) {
  42.         this.unitSpecification = unitSpecification;
  43.         this.end               = unitSpecification.length();
  44.         this.start             = 0;
  45.         this.last              = null;
  46.     }

  47.     /** Get the complete unit specification.
  48.      * @return complete unit specification
  49.      */
  50.     public String getUnitSpecification() {
  51.         return unitSpecification.toString();
  52.     }

  53.     /** Push back last returned token.
  54.      * <p>
  55.      * This can be called only once
  56.      * </p>
  57.      */
  58.     public void pushBack() {
  59.         upcoming = last;
  60.         last     = nextToLast;
  61.     }

  62.     /** Get next token.
  63.      * @return next token, or null if there are no more tokens
  64.      */
  65.     public Token next() {

  66.         if (upcoming != null) {
  67.             nextToLast = last;
  68.             last       = upcoming;
  69.             upcoming   = null;
  70.             return last;
  71.         }

  72.         // skip whitespace
  73.         while (start < end && Character.isWhitespace(unitSpecification.charAt(start))) {
  74.             ++start;
  75.         }

  76.         if (start >= end) {
  77.             // no more characters to analyze
  78.             nextToLast = last;
  79.             last       = null;
  80.             return null;
  81.         }

  82.         // look for prefixed units
  83.         int current = start;
  84.         while (current < end &&
  85.                (Character.isLowerCase(unitSpecification.charAt(current)) ||
  86.                 Character.isUpperCase(unitSpecification.charAt(current)) ||
  87.                 unitSpecification.charAt(current) == '°'  ||
  88.                 unitSpecification.charAt(current) == '◦'  ||
  89.                 unitSpecification.charAt(current) == '′'  ||
  90.                 unitSpecification.charAt(current) == '\'' ||
  91.                 unitSpecification.charAt(current) == '″'  ||
  92.                 unitSpecification.charAt(current) == '"'  ||
  93.                 unitSpecification.charAt(current) == '%'  ||
  94.                 unitSpecification.charAt(current) == '#')) {
  95.             ++current;
  96.         }
  97.         if (current > start) {
  98.             return emit(current, TokenType.IDENTIFIER, 0, 1);
  99.         }

  100.         // look for power
  101.         if (start < end - 1 &&
  102.             unitSpecification.charAt(start)     == '*' &&
  103.             unitSpecification.charAt(start + 1) == '*') {
  104.             // power indicator as **
  105.             return emit(start + 2, TokenType.POWER, 0, 1);
  106.         } else if (unitSpecification.charAt(start) == '^') {
  107.             // power indicator as ^
  108.             return emit(start + 1, TokenType.POWER, 0, 1);
  109.         } else if (convertSuperscript(start) != ' ' &&
  110.                    last != null &&
  111.                    last.getType() != TokenType.POWER) {
  112.             // virtual power indicator as we switch to superscript characters
  113.             return emit(start, TokenType.POWER, 0, 1);
  114.         }

  115.         // look for one character tokens
  116.         if (unitSpecification.charAt(start) == '*') {
  117.             return emit(start + 1, TokenType.MULTIPLICATION, 0, 1);
  118.         } else if (unitSpecification.charAt(start) == '×') {
  119.             return emit(start + 1, TokenType.MULTIPLICATION, 0, 1);
  120.         } else if (unitSpecification.charAt(start) == '.') {
  121.             return emit(start + 1, TokenType.MULTIPLICATION, 0, 1);
  122.         } else if (unitSpecification.charAt(start) == '·') {
  123.             return emit(start + 1, TokenType.MULTIPLICATION, 0, 1);
  124.         } else if (unitSpecification.charAt(start) == '/') {
  125.             return emit(start + 1, TokenType.DIVISION, 0, 1);
  126.         } else if (unitSpecification.charAt(start) == '⁄') {
  127.             return emit(start + 1, TokenType.DIVISION, 0, 1);
  128.         } else if (unitSpecification.charAt(start) == '(') {
  129.             return emit(start + 1, TokenType.OPEN, 0, 1);
  130.         } else if (unitSpecification.charAt(start) == ')') {
  131.             return emit(start + 1, TokenType.CLOSE, 0, 1);
  132.         } else if (unitSpecification.charAt(start) == '√') {
  133.             return emit(start + 1, TokenType.SQUARE_ROOT, 0, 1);
  134.         }

  135.         // look for special case "0.5" (used by CCSDS for square roots)
  136.         if (start < end - 2 &&
  137.              unitSpecification.charAt(start)     == '0' &&
  138.              unitSpecification.charAt(start + 1) == '.' &&
  139.              unitSpecification.charAt(start + 2) == '5') {
  140.             // ½ written as decimal number
  141.             return emit(start + 3, TokenType.FRACTION, 1, 2);
  142.         }

  143.         // look for special case "1.5" (used by CCSDS for power 3/2)
  144.         if (start < end - 2 &&
  145.              unitSpecification.charAt(start)     == '1' &&
  146.              unitSpecification.charAt(start + 1) == '.' &&
  147.              unitSpecification.charAt(start + 2) == '5') {
  148.             // 3/2 written as decimal number
  149.             return emit(start + 3, TokenType.FRACTION, 3, 2);
  150.         }

  151.         // look for unicode fractions
  152.         if (unitSpecification.charAt(start) == '¼') {
  153.             return emit(start + 1, TokenType.FRACTION, 1, 4);
  154.         } else if (unitSpecification.charAt(start) == '½') {
  155.             return emit(start + 1, TokenType.FRACTION, 1, 2);
  156.         } else if (unitSpecification.charAt(start) == '¾') {
  157.             return emit(start + 1, TokenType.FRACTION, 3, 4);
  158.         } else if (unitSpecification.charAt(start) == '⅐') {
  159.             return emit(start + 1, TokenType.FRACTION, 1, 7);
  160.         } else if (unitSpecification.charAt(start) == '⅑') {
  161.             return emit(start + 1, TokenType.FRACTION, 1, 9);
  162.         } else if (unitSpecification.charAt(start) == '⅒') {
  163.             return emit(start + 1, TokenType.FRACTION, 1, 10);
  164.         } else if (unitSpecification.charAt(start) == '⅓') {
  165.             return emit(start + 1, TokenType.FRACTION, 1, 3);
  166.         } else if (unitSpecification.charAt(start) == '⅔') {
  167.             return emit(start + 1, TokenType.FRACTION, 2, 3);
  168.         } else if (unitSpecification.charAt(start) == '⅕') {
  169.             return emit(start + 1, TokenType.FRACTION, 1, 5);
  170.         } else if (unitSpecification.charAt(start) == '⅖') {
  171.             return emit(start + 1, TokenType.FRACTION, 2, 5);
  172.         } else if (unitSpecification.charAt(start) == '⅗') {
  173.             return emit(start + 1, TokenType.FRACTION, 3, 5);
  174.         } else if (unitSpecification.charAt(start) == '⅘') {
  175.             return emit(start + 1, TokenType.FRACTION, 4, 5);
  176.         } else if (unitSpecification.charAt(start) == '⅙') {
  177.             return emit(start + 1, TokenType.FRACTION, 1, 6);
  178.         } else if (unitSpecification.charAt(start) == '⅚') {
  179.             return emit(start + 1, TokenType.FRACTION, 5, 6);
  180.         } else if (unitSpecification.charAt(start) == '⅛') {
  181.             return emit(start + 1, TokenType.FRACTION, 1, 8);
  182.         } else if (unitSpecification.charAt(start) == '⅜') {
  183.             return emit(start + 1, TokenType.FRACTION, 3, 8);
  184.         } else if (unitSpecification.charAt(start) == '⅝') {
  185.             return emit(start + 1, TokenType.FRACTION, 5, 8);
  186.         } else if (unitSpecification.charAt(start) == '⅞') {
  187.             return emit(start + 1, TokenType.FRACTION, 7, 8);
  188.         }

  189.         // it must be an integer, either as regular character or as superscript
  190.         final Converter converter = (convertSuperscript(start) == ' ') ?
  191.                                     this::noConvert :
  192.                                     this::convertSuperscript;

  193.         // manage sign, taking care of counting characters properly
  194.         final int sign;
  195.         final int numberStart;
  196.         if (converter.convert(start) == '+') {
  197.             sign        = +1;
  198.             numberStart = start + 1;
  199.         } else if (converter.convert(start) == '-') {
  200.             sign        = -1;
  201.             numberStart = start + 1;
  202.         } else {
  203.             sign        = 1;
  204.             numberStart = start;
  205.         }
  206.         current = numberStart;

  207.         int value = 0;
  208.         while (current < end) {
  209.             final int c = converter.convert(current);
  210.             if (c >= '0' && c <= '9') {
  211.                 value = value * 10 + (c - '0');
  212.                 ++current;
  213.             } else {
  214.                 break;
  215.             }
  216.         }
  217.         if (current > numberStart) {
  218.             // there were some digits
  219.             return emit(current, TokenType.INTEGER, sign * value, 1);
  220.         }

  221.         throw generateException();

  222.     }

  223.     /** Generate an exception.
  224.      * @return generated exception
  225.      */
  226.     public OrekitException generateException() {
  227.         return new OrekitException(OrekitMessages.UNKNOWN_UNIT, unitSpecification);
  228.     }

  229.     /** Emit one token.
  230.      * @param after index after token
  231.      * @param type token type
  232.      * @param numerator value of the token numerator
  233.      * @param denominator value of the token denominator
  234.      * @return new token
  235.      */
  236.     private Token emit(final int after, final TokenType type, final int numerator, final int denominator) {
  237.         final CharSequence subString = unitSpecification.subSequence(start, after);
  238.         start      = after;
  239.         nextToLast = last;
  240.         last       = new Token(subString, type, numerator,
  241.                                denominator == 1 ? null : new Fraction(numerator, denominator));
  242.         return last;
  243.     }

  244.     /** Convert a superscript character to regular digit or sign character.
  245.      * @param index character index
  246.      * @return regular digit or sign character, or ' ' if character is not a superscript
  247.      */
  248.     private char convertSuperscript(final int index) {
  249.         // we can't do fancy stuff with code points
  250.         // superscripts for 1, 2 and 3 are not in the same range as others
  251.         switch (unitSpecification.charAt(index)) {
  252.             case '⁰' :
  253.                 return '0';
  254.             case '¹' :
  255.                 return '1';
  256.             case '²' :
  257.                 return '2';
  258.             case '³' :
  259.                 return '3';
  260.             case '⁴' :
  261.                 return '4';
  262.             case '⁵' :
  263.                 return '5';
  264.             case '⁶' :
  265.                 return '6';
  266.             case '⁷' :
  267.                 return '7';
  268.             case '⁸' :
  269.                 return '8';
  270.             case '⁹' :
  271.                 return '9';
  272.             case '⁺' :
  273.                 return '+';
  274.             case '⁻' :
  275.                 return '-';
  276.             default :
  277.                 return ' ';
  278.         }

  279.     }

  280.     /** No-op converter.
  281.      * @param index character index
  282.      * @return character at index
  283.      */
  284.     private char noConvert(final int index) {
  285.         return unitSpecification.charAt(index);
  286.     }

  287.     /** Character converter. */
  288.     private interface Converter {
  289.         /** Convert a character.
  290.          * @param index character index
  291.          * @return converted character
  292.          */
  293.         char convert(int index);
  294.     }

  295. }