lib/Format/Encoding.h

1.1  joerg //===--- Encoding.h - Format C++ code ---------------------------*- C++ -*-===//
1.1  joerg //
1.1  joerg // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
1.1  joerg // See https://llvm.org/LICENSE.txt for license information.
1.1  joerg // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
1.1  joerg //
1.1  joerg //===----------------------------------------------------------------------===//
1.1  joerg ///
1.1  joerg /// \file
1.1  joerg /// Contains functions for text encoding manipulation. Supports UTF-8,
1.1  joerg /// 8-bit encodings and escape sequences in C++ string literals.
1.1  joerg ///
1.1  joerg //===----------------------------------------------------------------------===//
1.1  joerg
1.1  joerg #ifndef LLVM_CLANG_LIB_FORMAT_ENCODING_H
1.1  joerg #define LLVM_CLANG_LIB_FORMAT_ENCODING_H
1.1  joerg
1.1  joerg #include "clang/Basic/LLVM.h"
1.1  joerg #include "llvm/ADT/StringRef.h"
1.1  joerg #include "llvm/Support/ConvertUTF.h"
1.1  joerg #include "llvm/Support/Unicode.h"
1.1  joerg
1.1  joerg namespace clang {
1.1  joerg namespace format {
1.1  joerg namespace encoding {
1.1  joerg
/// Text encodings this header distinguishes.
enum Encoding {
  /// Text that can be decoded as UTF-8.
  Encoding_UTF8 = 0,
  /// Anything else. We treat all other encodings as 8-bit encodings.
  Encoding_Unknown = 1
};
1.1  joerg
1.1  joerg /// Detects encoding of the Text. If the Text can be decoded using UTF-8,
1.1  joerg /// it is considered UTF8, otherwise we treat it as some 8-bit encoding.
1.1  joerg inline Encoding detectEncoding(StringRef Text) {
1.1  joerg   const llvm::UTF8 *Ptr = reinterpret_cast<const llvm::UTF8 *>(Text.begin());
1.1  joerg   const llvm::UTF8 *BufEnd = reinterpret_cast<const llvm::UTF8 *>(Text.end());
1.1  joerg   if (llvm::isLegalUTF8String(&Ptr, BufEnd))
1.1  joerg     return Encoding_UTF8;
1.1  joerg   return Encoding_Unknown;
1.1  joerg }
1.1  joerg
1.1  joerg /// Returns the number of columns required to display the \p Text on a
1.1  joerg /// generic Unicode-capable terminal. Text is assumed to use the specified
1.1  joerg /// \p Encoding.
1.1  joerg inline unsigned columnWidth(StringRef Text, Encoding Encoding) {
1.1  joerg   if (Encoding == Encoding_UTF8) {
1.1  joerg     int ContentWidth = llvm::sys::unicode::columnWidthUTF8(Text);
1.1  joerg     // FIXME: Figure out the correct way to handle this in the presence of both
1.1  joerg     // printable and unprintable multi-byte UTF-8 characters. Falling back to
1.1  joerg     // returning the number of bytes may cause problems, as columnWidth suddenly
1.1  joerg     // becomes non-additive.
1.1  joerg     if (ContentWidth >= 0)
1.1  joerg       return ContentWidth;
1.1  joerg   }
1.1  joerg   return Text.size();
1.1  joerg }
1.1  joerg
1.1  joerg /// Returns the number of columns required to display the \p Text,
1.1  joerg /// starting from the \p StartColumn on a terminal with the \p TabWidth. The
1.1  joerg /// text is assumed to use the specified \p Encoding.
1.1  joerg inline unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn,
1.1  joerg                                     unsigned TabWidth, Encoding Encoding) {
1.1  joerg   unsigned TotalWidth = 0;
1.1  joerg   StringRef Tail = Text;
1.1  joerg   for (;;) {
1.1  joerg     StringRef::size_type TabPos = Tail.find('\t');
1.1  joerg     if (TabPos == StringRef::npos)
1.1  joerg       return TotalWidth + columnWidth(Tail, Encoding);
1.1  joerg     TotalWidth += columnWidth(Tail.substr(0, TabPos), Encoding);
1.1  joerg     if (TabWidth)
1.1  joerg       TotalWidth += TabWidth - (TotalWidth + StartColumn) % TabWidth;
1.1  joerg     Tail = Tail.substr(TabPos + 1);
1.1  joerg   }
1.1  joerg }
1.1  joerg
1.1  joerg /// Gets the number of bytes in a sequence representing a single
1.1  joerg /// codepoint and starting with FirstChar in the specified Encoding.
1.1  joerg inline unsigned getCodePointNumBytes(char FirstChar, Encoding Encoding) {
1.1  joerg   switch (Encoding) {
1.1  joerg   case Encoding_UTF8:
1.1  joerg     return llvm::getNumBytesForUTF8(FirstChar);
1.1  joerg   default:
1.1  joerg     return 1;
1.1  joerg   }
1.1  joerg }
1.1  joerg
/// Returns true if \p c is an octal digit, i.e. lies in '0'..'7'.
inline bool isOctDigit(char c) {
  return c >= '0' && c <= '7';
}
1.1  joerg
/// Returns true if \p c is a hexadecimal digit: one of '0'..'9', 'a'..'f' or
/// 'A'..'F'.
inline bool isHexDigit(char c) {
  if (c >= '0' && c <= '9')
    return true;
  if (c >= 'a' && c <= 'f')
    return true;
  return c >= 'A' && c <= 'F';
}
1.1  joerg
1.1  joerg /// Gets the length of an escape sequence inside a C++ string literal.
1.1  joerg /// Text should span from the beginning of the escape sequence (starting with a
1.1  joerg /// backslash) to the end of the string literal.
1.1  joerg inline unsigned getEscapeSequenceLength(StringRef Text) {
1.1  joerg   assert(Text[0] == '\\');
1.1  joerg   if (Text.size() < 2)
1.1  joerg     return 1;
1.1  joerg
1.1  joerg   switch (Text[1]) {
1.1  joerg   case 'u':
1.1  joerg     return 6;
1.1  joerg   case 'U':
1.1  joerg     return 10;
1.1  joerg   case 'x': {
1.1  joerg     unsigned I = 2; // Point after '\x'.
1.1  joerg     while (I < Text.size() && isHexDigit(Text[I]))
1.1  joerg       ++I;
1.1  joerg     return I;
1.1  joerg   }
1.1  joerg   default:
1.1  joerg     if (isOctDigit(Text[1])) {
1.1  joerg       unsigned I = 1;
1.1  joerg       while (I < Text.size() && I < 4 && isOctDigit(Text[I]))
1.1  joerg         ++I;
1.1  joerg       return I;
1.1  joerg     }
1.1  joerg     return 1 + llvm::getNumBytesForUTF8(Text[1]);
1.1  joerg   }
1.1  joerg }
1.1  joerg
1.1  joerg } // namespace encoding
1.1  joerg } // namespace format
1.1  joerg } // namespace clang
1.1  joerg
1.1  joerg #endif