nicolas / debian.moreutils (public) (License: GPL-2, GPL-2+, Expat, BSD-2-Clause, Public Domain) (since 2018-09-25) (hash sha1)
Debian packaging of joeyh's moreutils

/isutf8.c (4306c7d7a22b230db58248e3317ada4a1f60bb34) (7296 bytes) (mode 100644) (type blob)

/*
 * isutf8.c - do the input files look like valid utf-8 byte streams?
 * 
 * Copyright (C) 2005  Lars Wirzenius
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
 */

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <getopt.h>


/* Program version string; usage() prints it in its last output line. */
#define VERSION "1.1"


/*
 * Code to indicate an invalid UTF8 character.
 *
 * The value is larger than 0x7FFFFFFF, the biggest value encodeutf8()
 * accepts, so it cannot be confused with a character code that
 * round-trips through the encoder.
 *
 * This is a macro rather than an enumeration constant: before C23, ISO C
 * requires enumeration constants to be representable as int, which
 * 0xffffffff is not when int is 32 bits wide. The UL suffix gives it the
 * same type that decodeutf8() returns.
 */
#define INVALID_CHAR 0xffffffffUL


/*
 * Encode the value 'u' as UTF8 using the fewest bytes possible and store
 * the bytes in 'buf'. Values up to 0x7FFFFFFF (31 bits) are accepted and
 * take between one and six bytes.
 *
 * Returns the number of bytes stored. Returns -1, without touching 'buf',
 * if 'u' does not fit in 31 bits or if its encoding would need more than
 * 'maxbytes' bytes.
 */
static int encodeutf8(unsigned long u, unsigned char *buf, size_t maxbytes)
{
        /* limit[n-1] is the largest value an n-byte sequence can carry. */
        static const unsigned long limit[] = {
            0x0000007F,
            0x000007FF,
            0x0000FFFF,
            0x001FFFFF,
            0x03FFFFFF,
            0x7FFFFFFF,
        };
        enum { MAX_LEN = sizeof(limit) / sizeof(limit[0]) };
        int len, pos;

        /* Pick the shortest length whose limit covers 'u'. */
        len = 1;
        while (len <= MAX_LEN && u > limit[len - 1])
                ++len;
        if (len > MAX_LEN)
                return -1;

        if ((size_t) len > maxbytes)
                return -1;

        if (len == 1) {
                /* Single byte values are stored unchanged. */
                buf[0] = u;
                return len;
        }

        /*
         * Continuation bytes are written last to first; each one is
         * 10xxxxxx and carries the next six low-order bits of 'u'.
         */
        pos = len;
        while (--pos > 0) {
                buf[pos] = 0x80 | (u & 0x3f);
                u >>= 6;
        }

        /*
         * The lead byte starts with 'len' one bits; whatever is left of
         * 'u' fills its low-order bits.
         */
        buf[0] = (unsigned char) (0xFF << (8 - len)) | u;
        return len;
}


/*
 * Return number of ones at the top of a byte, i.e. how many consecutive
 * one bits there are starting from bit 7 of 'c' and going downwards.
 * The result is between 0 and 8.
 *
 * Only the low eight bits of 'c' are looked at, and the counting is done
 * on an unsigned value so that no signed left shift (undefined for
 * negative or very large arguments) ever takes place.
 */
static int high_ones(int c) {
        unsigned int byte = (unsigned int) c & 0xFFu;
        int n = 0;

        while ((byte & 0x80u) != 0) {
                ++n;
                /* Bring the next lower bit up to position 7. */
                byte = (byte << 1) & 0xFFu;
        }
        return n;
}


/*
 * Turn the 'nbytes' bytes at 'buf', which are expected to hold exactly
 * one UTF8 sequence, into the character code they encode.
 *
 * INVALID_CHAR is returned when 'nbytes' is not positive, when a single
 * byte is not in the ASCII range, when the number of leading one bits in
 * the first byte differs from 'nbytes', or when any following byte does
 * not have the form 10xxxxxx. Whether the sequence is the shortest
 * possible one is not checked here.
 */
static unsigned long decodeutf8(unsigned char *buf, int nbytes)
{
        unsigned long code;
        int lead_ones, k;

        if (nbytes < 1)
                return INVALID_CHAR;

        /* A lone byte is only acceptable if it is plain ASCII. */
        if (nbytes == 1) {
                if (buf[0] < 0x80)
                        return buf[0];
                return INVALID_CHAR;
        }

        /* The lead byte announces the sequence length in its top bits. */
        lead_ones = high_ones(buf[0]);
        if (lead_ones != nbytes)
                return INVALID_CHAR;

        /* Payload of the lead byte is whatever follows the length bits. */
        code = buf[0] & (0xff >> lead_ones);

        k = 1;
        while (k < nbytes) {
                unsigned char cont = buf[k++];

                if ((cont & 0xC0) != 0x80)
                        return INVALID_CHAR;
                /* Each continuation byte adds six more payload bits. */
                code = (code << 6) | (cont & 0x3f);
        }
        return code;
}


/*
 * Determine if the contents of an open file form a valid UTF8 byte stream.
 * Do this by collecting bytes for a character into a buffer and then
 * decode the bytes and re-encode them and compare that they are identical
 * to the original bytes (which rejects encodings that are longer than
 * necessary).
 *
 * 'file' is read with getc() until EOF; it is not closed here.
 * 'filename' is only used in messages.
 * 'quiet', when non-zero, suppresses the "invalid UTF-8 code" message
 * that is otherwise printed on standard output.
 *
 * Returns 1 if EOF is reached without finding a problem. Returns 0 if an
 * invalid sequence is found, or if reading fails; a read failure is
 * reported on standard error in the same format main() uses for files
 * that cannot be opened.
 *
 * In the message, 'line' and 'char' are 1-based and 'char' counts whole
 * characters (ASCII and multi-byte alike) on the current line, so it is
 * the position of the offending character. 'byte offset' is the 0-based
 * offset, within the current line, of the byte that was being examined
 * when the problem was noticed.
 */
static int is_utf8_byte_stream(FILE *file, char *filename, int quiet) {
        enum { MAX_UTF8_BYTES = 6 };
        unsigned char buf[MAX_UTF8_BYTES];
        unsigned char buf2[MAX_UTF8_BYTES];
        int nbytes, nbytes2;
        int c;
        unsigned long code;
        unsigned long line, col, byteoff;

        nbytes = 0;
        line = 1;
        col = 1;
        byteoff = 0;

        for (;;) {
                c = getc(file);

                /*
                 * getc() returns EOF for read errors as well as for end
                 * of file; do not mistake a failed read for a clean end.
                 */
                if (c == EOF && ferror(file)) {
                        int saved_errno = errno;

                        fprintf(stderr, "isutf8: %s: error %d: %s\n",
                                        filename, saved_errno,
                                        strerror(saved_errno));
                        return 0;
                }

                if (c == EOF || c < 0x80 || (c & 0xC0) != 0x80) {
                        /* New char starts, deal with previous one. */
                        if (nbytes > 0) {
                                code = decodeutf8(buf, nbytes);
                                if (code == INVALID_CHAR)
                                        goto error;
                                nbytes2 = encodeutf8(code, buf2,
                                                     MAX_UTF8_BYTES);
                                if (nbytes != nbytes2 ||
                                    memcmp(buf, buf2, nbytes) != 0)
                                        goto error;
                                /* One multi-byte character completed. */
                                ++col;
                        }
                        nbytes = 0;
                        /* If it's UTF8, start collecting again. */
                        if (c != EOF && c >= 0x80)
                                buf[nbytes++] = c;
                } else {
                        /* This is a continuation byte, append to buffer. */
                        if (nbytes == MAX_UTF8_BYTES)
                                goto error;
                        buf[nbytes++] = c;
                }

                if (c == EOF)
                        break;
                else if (c == '\n') {
                        ++line;
                        byteoff = 0;
                        col = 1;
                } else {
                        ++byteoff;
                        /*
                         * An ASCII byte is a complete character by
                         * itself and has to be counted too; multi-byte
                         * characters are counted above once validated.
                         */
                        if (c < 0x80)
                                ++col;
                }
        }

        /* Every path that reaches EOF has already emptied the buffer. */
        if (nbytes != 0)
                goto error;

        return 1;

error:
        if (!quiet) {
                printf("%s: line %lu, char %lu, byte offset %lu: "
                       "invalid UTF-8 code\n", filename, line, col, byteoff);
        }
        return 0;
}


/*
 * Write a three line help text to standard output: the command synopsis
 * (with 'program_name' as the command), a one line description, and the
 * program version.
 */
static void usage(const char *program_name) {
	FILE *out = stdout;

	fprintf(out, "Usage: %s [-hq] [--help] [--quiet] [file ...]\n",
		program_name);
	fputs("Check whether input files are valid UTF-8.\n", out);
	fprintf(out, "This is version %s.\n", VERSION);
}


/*
 * Program entry point.
 *
 * Options: -h / --help prints the usage text and exits successfully;
 * -q / --quiet suppresses the per-file "invalid UTF-8 code" message.
 * Every remaining argument is opened and checked as a file; with no file
 * arguments standard input is checked instead.
 *
 * The exit status is 0 if every input could be opened and was found to
 * be valid, and EXIT_FAILURE otherwise (including for unknown options).
 */
int main(int argc, char **argv) {
	int i, ok;
	FILE *file;

	int quiet;
	struct option options[] = {
		{ "help", no_argument, NULL, 'h' },
		/* --quiet stores 1 in 'quiet'; getopt_long() then returns 0. */
		{ "quiet", no_argument, &quiet, 1 },
		/*
		 * getopt_long() finds the end of the table by looking for
		 * an all-zero element; without it an unrecognised long
		 * option makes it read past the end of the array.
		 */
		{ NULL, 0, NULL, 0 },
	};
	int opt;
	
	quiet = 0;
	
	while ((opt = getopt_long(argc, argv, "hq", options, NULL)) != -1) {
		switch (opt) {
		case 0:
			/* A long option that only set a flag variable. */
			break;
			
		case 'h':
			usage(argv[0]);
			exit(0);
			break;
			
		case 'q':
			quiet = 1;
			break;

		case '?':
			/* getopt_long() has already reported the problem. */
			exit(EXIT_FAILURE);

		default:
			abort();
		}
	}

	if (optind == argc)
		ok = is_utf8_byte_stream(stdin, "stdin", quiet);
	else {
		ok = 1;
		/* Keep going after a failure so every file gets checked. */
		for (i = optind; i < argc; ++i) {
			file = fopen(argv[i], "r");
			if (file == NULL) {
				fprintf(stderr, "isutf8: %s: error %d: %s\n", 
				                argv[i], errno, 
				                strerror(errno));
				ok = 0;
			} else {
				if (! is_utf8_byte_stream(file, argv[i], quiet))
					ok = 0;
				(void) fclose(file);
			}
		}
	}
	
	if (ok)
		exit(0);
	exit(EXIT_FAILURE);
}


Mode Type Size Ref File
100644 blob 17989 b7b5f53df1412df1e117607f18385b39004cdaa2 COPYING
100644 blob 933 b0595c417e33a8115ed8e2f62e50e9762d5cede7 Makefile
100644 blob 1048 ccc257cd3e6190cafffba1b73125677a16beea33 README
100755 blob 687 3abb315c7a9592c44f884cf5817dd062f9804f43 check-isutf8
100755 blob 2607 a695935b24a5f2789c71a8affc4486859a41f737 combine
040000 tree - d2320f6e716f8d093815fd86cc881880d24fcc60 debian
100644 blob 12767 f8fa0240cfe7d245c22dbd09d77be08e479ada1d ifdata.c
100644 blob 7039 5f2837f71cb7ea8e1aa8481d854d975174e19ab9 ifdata.docbook
100644 blob 3006 d8ecea9b8bc416154533572e1ce85a0385b7af10 ifne.c
100644 blob 2360 41fa9abe7a23b63f5afd110dcd0b3f78b0e4c531 ifne.docbook
100644 blob 7296 4306c7d7a22b230db58248e3317ada4a1f60bb34 isutf8.c
100644 blob 2894 f9c9eb59e9e15197e686a25a93d8785e4522696a isutf8.docbook
100644 blob 5471 4925409bd548b058f07defe913724868801040df lckdo.c
100644 blob 3261 8a0a4a863aba57a7a4d7b06b69414c25c21dfa17 lckdo.docbook
100644 blob 5777 43ba76aa6f3d24cfe8bb90b3776b044f27df9d37 mispipe.c
100644 blob 2292 b645b2c756f9b79cdde96a4a82c63bd9fd60fbff mispipe.docbook
100644 blob 1040 6ba38f78da10b61c8670b1c450fa769248ef84c4 pee.c
100644 blob 2082 18c753f289f920c9fddc06b191a2c7a031bbe391 pee.docbook
100644 blob 7301 a53a2cf1906998c91533f5f5435ceeeeb1a7cd59 physmem.c
100644 blob 8551 80733a22387f8290b77434f04a6a5dae099cc6b9 sponge.c
100644 blob 1757 f9395a72eb845f0c8007dd2777501a305243f579 sponge.docbook
100755 blob 2515 ca150b4a36105ee55ca72920d6adce1aafd9a05a ts
100755 blob 4495 a77739f27d8cab6843471de92857fe5064f9ace4 vidir
100755 blob 1260 8cee2b1775c19a3e7e046984a325eebfc8b94ecf vipe
100755 blob 2458 cedfbaee3b39348daa81758057c0adbe66374dbd zrun
Hints:
Before your first commit, do not forget to set up your git environment:
git config --global user.name "your_name_here"
git config --global user.email "your@email_here"

Clone this repository using HTTP(S):
git clone https://rocketgit.com/user/nicolas/debian.moreutils

Clone this repository using ssh (do not forget to upload a key first):
git clone ssh://rocketgit@ssh.rocketgit.com/user/nicolas/debian.moreutils

Clone this repository using git:
git clone git://git.rocketgit.com/user/nicolas/debian.moreutils

You are allowed to anonymously push to this repository.
This means that your pushed commits will automatically be transformed into a merge request:
... clone the repository ...
... make some changes and some commits ...
git push origin main