LexicalAnalyzerSelector.java

  1. /* Copyright 2002-2025 CS GROUP
  2.  * Licensed to CS GROUP (CS) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * CS licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *   http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.orekit.files.ccsds.utils.lexical;

  18. import java.io.BufferedInputStream;
  19. import java.io.BufferedReader;
  20. import java.io.IOException;
  21. import java.io.InputStream;
  22. import java.io.Reader;

  23. import org.orekit.data.DataSource;
  24. import org.orekit.errors.OrekitException;
  25. import org.orekit.errors.OrekitMessages;

  26. /** Utility class for selecting either {@link XmlLexicalAnalyzer} or {@link KvnLexicalAnalyzer} depending on
  27.  * data first bytes.
  28.  * @author Luc Maisonobe
  29.  * @since 11.0
  30.  */
  31. public class LexicalAnalyzerSelector {

  32.     /** Buffer size. */
  33.     private static final int BUFFER = 4096;

  34.     /** First bytes in XML document, UCS-4, big-endian, with Byte Order Mark. */
  35.     private static final byte[] UCS_4_BE_BOM = {
  36.         0x00, 0x00, -0x02, -0X01, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x6c
  37.     };

  38.     /** First bytes in XML document, UCS-4, little-endian, with Byte Order Mark. */
  39.     private static final byte[] UCS_4_LE_BOM = {
  40.         -0x01, -0X02, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00
  41.     };

  42.     /** First bytes in XML document, UTF-16, big-endian, with Byte Order Mark. */
  43.     private static final byte[] UTF_16_BE_BOM = {
  44.         -0x02, -0X01, 0x00, 0x3c, 0x00, 0x3f, 0x00, 0x78, 0x00, 0x6d, 0x00, 0x6c
  45.     };

  46.     /** First bytes in XML document, UTF-16, little-endian, with Byte Order Mark. */
  47.     private static final byte[] UTF_16_LE_BOM = {
  48.         -0x01, -0X02, 0x3c, 0x00, 0x3f, 0x00, 0x78, 0x00, 0x6d, 0x00, 0x6c, 0x00
  49.     };

  50.     /** First bytes in XML document, UTF-8, endianness irrelevant, with Byte Order Mark. */
  51.     private static final byte[] UTF_8_BOM = {
  52.         -0x11, -0x45, -0x41, 0x3c, 0x3f, 0x78, 0x6d, 0x6c
  53.     };

  54.     /** First bytes in XML document, UCS-4, big-endian, without Byte Order Mark. */
  55.     private static final byte[] UCS_4_BE = {
  56.         0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x6c
  57.     };

  58.     /** First bytes in XML document, UCS-4, little-endian, without Byte Order Mark. */
  59.     private static final byte[] UCS_4_LE = {
  60.         0x3c, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00
  61.     };

  62.     /** First bytes in XML document, UTF-16, big-endian, without Byte Order Mark. */
  63.     private static final byte[] UTF_16_BE = {
  64.         0x00, 0x3c, 0x00, 0x3f, 0x00, 0x78, 0x00, 0x6d, 0x00, 0x6c
  65.     };

  66.     /** First bytes in XML document, UTF-16, little-endian, without Byte Order Mark. */
  67.     private static final byte[] UTF_16_LE = {
  68.         0x3c, 0x00, 0x3f, 0x00, 0x78, 0x00, 0x6d, 0x00, 0x6c, 0x00
  69.     };

  70.     /** First bytes in XML document, UTF-8, endianness irrelevant, without Byte Order Mark. */
  71.     private static final byte[] UTF_8 = {
  72.         0x3c, 0x3f, 0x78, 0x6d, 0x6c
  73.     };

  74.     /** First characters in XML document, with Byte Order Mark. */
  75.     private static final String CHARS_BOM = "\ufeff<?xml";

  76.     /** First characters in XML document, without Byte Order Mark. */
  77.     private static final String CHARS = "<?xml";

  78.     /** Private constructor for a utility class.
  79.      */
  80.     private LexicalAnalyzerSelector() {
  81.         // never called
  82.     }

  83.     /** Select a {@link LexicalAnalyzer} for a {@link DataSource} based on content.
  84.      * @param source data source to analyze
  85.      * @return lexical analyzer suited for the data source format
  86.      * @throws IOException if first bytes of source cannot be read
  87.      */
  88.     public static LexicalAnalyzer select(final DataSource source) throws IOException {
  89.         final DataSource.Opener opener = source.getOpener();
  90.         if (opener.rawDataIsBinary()) {
  91.             return select(source.getName(), opener.openStreamOnce());
  92.         } else {
  93.             return select(source.getName(), opener.openReaderOnce());
  94.         }
  95.     }

  96.     /** Select a {@link LexicalAnalyzer} based on content.
  97.      * @param name message name
  98.      * @param stream binary stream with message content
  99.      * @return lexical analyzer suited for the data source format
  100.      * @throws IOException if first bytes of source cannot be read
  101.      */
  102.     private static LexicalAnalyzer select(final String name, final InputStream stream) throws IOException {

  103.         if (stream == null) {
  104.             throw new OrekitException(OrekitMessages.UNABLE_TO_FIND_FILE, name);
  105.         }
  106.         final BufferedInputStream bis = new BufferedInputStream(stream, BUFFER);

  107.         // read the first bytes
  108.         final int size = UCS_4_BE_BOM.length; // UCS-4 with BOM is the longest reference sequence
  109.         bis.mark(size);
  110.         final byte[] first = new byte[size];
  111.         int read = 0;
  112.         while (read < first.length) {
  113.             final int n = bis.read(first, read, size - read);
  114.             if (n < 0) {
  115.                 // the file is too short for a proper CCSDS message,
  116.                 // we return arbitrarily a KVN lexical analyzer,
  117.                 // anyway, it will fail shortly during parsing
  118.                 bis.reset();
  119.                 return new KvnLexicalAnalyzer(new DataSource(name, () -> bis));
  120.             }
  121.             read += n;
  122.         }

  123.         // attempt to recognize an XML prolog, taking care of Byte Order Mark and encoding
  124.         // we use the tables from section F of Extensible Markup Language (XML) 1.0 (Fifth Edition)
  125.         // W3C Recommendation 26 November 2008 (https://www.w3.org/TR/2008/REC-xml-20081126/#sec-guessing),
  126.         // ignoring the unusual octet orders 2143 and 3412
  127.         if (checkSequence(first, UTF_8)     || checkSequence(first, UTF_8_BOM)     ||
  128.             checkSequence(first, UTF_16_LE) || checkSequence(first, UTF_16_LE_BOM) ||
  129.             checkSequence(first, UTF_16_BE) || checkSequence(first, UTF_16_BE_BOM) ||
  130.             checkSequence(first, UCS_4_LE)  || checkSequence(first, UCS_4_LE_BOM)  ||
  131.             checkSequence(first, UCS_4_BE)  || checkSequence(first, UCS_4_BE_BOM)) {
  132.             // we recognized the "<?xml" sequence at start of an XML file
  133.             bis.reset();
  134.             return new XmlLexicalAnalyzer(new DataSource(name, () -> bis));
  135.         } else {
  136.             // it was not XML, the only other option is KVN
  137.             bis.reset();
  138.             return new KvnLexicalAnalyzer(new DataSource(name, () -> bis));
  139.         }

  140.     }

  141.     /** Select a {@link LexicalAnalyzer} based on content.
  142.      * @param name message name
  143.      * @param reader character stream with message content
  144.      * @return lexical analyzer suited for the data source format
  145.      * @throws IOException if first bytes of source cannot be read
  146.      */
  147.     private static LexicalAnalyzer select(final String name, final Reader reader) throws IOException {

  148.         if (reader == null) {
  149.             throw new OrekitException(OrekitMessages.UNABLE_TO_FIND_FILE, name);
  150.         }
  151.         final BufferedReader br = new BufferedReader(reader, BUFFER);

  152.         // read the first characters
  153.         final int size = CHARS_BOM.length();
  154.         br.mark(size);
  155.         final char[] first = new char[size];
  156.         int read = 0;
  157.         while (read < first.length) {
  158.             final int n = br.read(first, read, size - read);
  159.             if (n < 0) {
  160.                 // the file is too short for a proper CCSDS message,
  161.                 // we return arbitrarily a KVN lexical analyzer,
  162.                 // anyway, it will fail shortly during parsing
  163.                 br.reset();
  164.                 return new KvnLexicalAnalyzer(new DataSource(name, () -> br));
  165.             }
  166.             read += n;
  167.         }
  168.         final String firstString = new String(first);

  169.         // attempt to recognize an XML prolog
  170.         if (firstString.startsWith(CHARS) || CHARS_BOM.equals(firstString)) {
  171.             // we recognized the "<?xml" sequence at start of an XML file
  172.             br.reset();
  173.             return new XmlLexicalAnalyzer(new DataSource(name, () -> br));
  174.         } else {
  175.             // it was not XML, the only other option is KVN
  176.             br.reset();
  177.             return new KvnLexicalAnalyzer(new DataSource(name, () -> br));
  178.         }

  179.     }

  180.     /** Check if first bytes match reference sequence.
  181.      * @param first first bytes read
  182.      * @param reference reference sequence
  183.      * @return true if first bytes match reference sequence
  184.      */
  185.     private static boolean checkSequence(final byte[] first, final byte[] reference) {
  186.         for (int i = 0; i < reference.length; ++i) {
  187.             if (first[i] != reference[i]) {
  188.                 return false;
  189.             }
  190.         }
  191.         return true;
  192.     }

  193. }