RocketGit

nicolas / debian.moreutils (public) (License: GPL-2, GPL-2+, Expat, BSD-2-Clause, Public Domain) (since 2018-09-25) (hash sha1)

Debian packaging of joeyh's moreutils

Clone URLs: https://rocketgit.com/user/nicolas/debian.moreutils ssh://rocketgit@ssh.rocketgit.com/user/nicolas/debian.moreutils git://git.rocketgit.com/user/nicolas/debian.moreutils

debian/0.50 debian/0.51 debian/0.52 debian/0.53 debian/0.54-1 debian/0.55-1 debian/0.55-2 debian/0.56-1 debian/0.57-1 debian/0.58-1 debian/0.59-1 debian/0.62-1 upstream/0.28 upstream/0.29 upstream/0.30 upstream/0.31 upstream/0.32 upstream/0.33 upstream/0.34 upstream/0.35 upstream/0.36 upstream/0.37 upstream/0.38 upstream/0.39 upstream/0.40 upstream/0.41 upstream/0.42 upstream/0.43 upstream/0.44 upstream/0.45 upstream/0.46 upstream/0.47 upstream/0.48 upstream/0.49 upstream/0.50 upstream/0.51 upstream/0.52 upstream/0.53 upstream/0.54 upstream/0.55 upstream/0.56 upstream/0.57 upstream/0.58 upstream/0.59 upstream/0.60 upstream/0.61 upstream/0.62 upstream/debian/0.50 upstream/debian/0.51 upstream/debian/0.52 upstream/debian/0.53 upstream/debian/0.54-1 upstream/debian/0.55-1 upstream/debian/0.55-2 auto-test master

/isutf8.c (c5f5eeb667c425c3ef02516712c08acb72f3f557) (7581 bytes) (mode 100644) (type blob)

/*
 * isutf8.c - do the input files look like valid utf-8 byte streams?
 * 
 * Copyright (C) 2005  Lars Wirzenius
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
 */

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <getopt.h>


/* Version string of this program, printed by usage(). */
#define VERSION "1.1"


/*
 * Code to indicate an invalid UTF8 character.
 *
 * This is a macro rather than an enumeration constant: before C23 an
 * enumeration constant must have a value representable as an int, which
 * 0xffffffff is not when int is 32 bits wide. The UL suffix gives the
 * constant the same type that decodeutf8() returns.
 */
#define INVALID_CHAR 0xffffffffUL


/*
 * Encode the value 'u' (at most 31 bits) as the shortest possible UTF8
 * byte sequence and store that sequence in 'buf'.
 *
 * Returns the number of bytes stored, or -1 when 'u' needs more than
 * 31 bits or when the sequence would be longer than 'maxbytes' bytes.
 * Nothing is written to 'buf' when -1 is returned.
 */
static int encodeutf8(unsigned long u, unsigned char *buf, size_t maxbytes)
{
        /* limit[k] is the largest value that fits in a (k+1)-byte sequence. */
        static const unsigned long limit[] = {
            0x0000007F,
            0x000007FF,
            0x0000FFFF,
            0x001FFFFF,
            0x03FFFFFF,
            0x7FFFFFFF,
        };
        enum { NLIMITS = sizeof(limit) / sizeof(limit[0]) };
        int len;
        int pos;

        if (u > limit[NLIMITS - 1])
                return -1;

        /* Pick the smallest sequence length able to hold 'u'. */
        len = 1;
        while (u > limit[len - 1])
                ++len;
        assert(len <= NLIMITS);

        if ((size_t) len > maxbytes)
                return -1;

        /* Seven-bit values are stored as they are. */
        if (len == 1) {
                buf[0] = u;
                return len;
        }

        /* Trailing bytes hold six payload bits each, filled back to front. */
        for (pos = len - 1; pos >= 1; pos--) {
                buf[pos] = 0x80 | (u & 0x3f);
                u >>= 6;
        }

        /* Lead byte: 'len' one-bits at the top, then the leftover payload. */
        buf[0] = (unsigned char) ~(0xFF >> len) | u;

        return len;
}


/*
 * Count the consecutive one-bits at the top of the low byte of 'c',
 * starting at bit 7 and stopping at the first zero bit. The result is
 * between 0 and 8.
 */
static int high_ones(int c) {
        int mask;
        int count = 0;

        /* Walk a single-bit mask from bit 7 down to bit 0. */
        for (mask = 0x80; mask != 0; mask >>= 1) {
                if ((c & mask) == 0)
                        break;
                count++;
        }
        return count;
}


/*
 * Decode one UTF8 character stored in the first 'nbytes' bytes of 'buf'
 * and return its character code.
 *
 * INVALID_CHAR is returned when 'nbytes' is not positive, when the lead
 * byte does not announce exactly 'nbytes' bytes, when a trailing byte is
 * not of the form 10xxxxxx, or when the decoded code is one that
 * conforming UTF-8 may not contain.
 */
static unsigned long decodeutf8(unsigned char *buf, int nbytes)
{
        unsigned long code;
        int lead_ones;
        int k;

        if (nbytes <= 0)
                return INVALID_CHAR;

        /* A single byte is only acceptable when it is 7-bit. */
        if (nbytes == 1) {
                if (buf[0] < 0x80)
                        return buf[0];
                return INVALID_CHAR;
        }

        /* The number of leading one-bits must equal the sequence length. */
        lead_ones = high_ones(buf[0]);
        if (lead_ones != nbytes)
                return INVALID_CHAR;

        /* Payload of the lead byte is whatever follows the length bits. */
        code = buf[0] & (0xff >> lead_ones);

        k = 1;
        while (k < nbytes) {
                unsigned char byte = buf[k];

                if ((byte & 0xC0) != 0x80)
                        return INVALID_CHAR;
                code = (code << 6) | (byte & 0x3f);
                k++;
        }

        /* Conforming UTF-8 cannot contain the UTF-16 surrogates
           0xd800-0xdfff, nor the codes 0xfffe and 0xffff. */
        if ((code >= 0xD800 && code <= 0xDFFF) ||
            code == 0xFFFE || code == 0xFFFF)
                return INVALID_CHAR;

        return code;
}


/*
 * Determine if the contents of an open file form a valid UTF8 byte stream.
 *
 * The bytes of one character are collected into a buffer. When the next
 * character starts (or EOF is reached) the collected bytes are decoded,
 * the resulting code is re-encoded, and the re-encoding is compared with
 * the original bytes; any mismatch means the input was not the shortest
 * encoding of that code and is rejected.
 *
 * Parameters:
 *   file      stream to read from
 *   filename  name to show in diagnostics
 *   quiet     when non-zero, do not print the "invalid UTF-8 code" message
 *
 * Returns 1 when EOF is reached without finding an invalid sequence.
 * Returns 0 when an invalid sequence is found, or when reading the
 * stream fails; a read failure is always reported on stderr.
 */
static int is_utf8_byte_stream(FILE *file, char *filename, int quiet) {
        enum { MAX_UTF8_BYTES = 6 };
        unsigned char buf[MAX_UTF8_BYTES];  /* bytes of the pending character */
        unsigned char buf2[MAX_UTF8_BYTES]; /* re-encoding of the decoded code */
        int nbytes, nbytes2;
        int c;
        unsigned long code;
        /* Position for diagnostics: 1-based line, 1-based character on the
           line, and number of bytes read so far on the line. */
        unsigned long line, col, byteoff;

        nbytes = 0;
        line = 1;
        col = 1;
        byteoff = 0;

        for (;;) {
                c = getc(file);

                /*
                 * getc() returns EOF both at end of file and on a read
                 * error. Only the former means the whole stream was
                 * examined, so a read error must not count as "valid".
                 */
                if (c == EOF && ferror(file)) {
                        int err = errno;

                        fprintf(stderr, "isutf8: %s: error %d: %s\n",
                                filename, err, strerror(err));
                        return 0;
                }

                if (c == EOF || c < 0x80 || (c & 0xC0) != 0x80) {
                        /* New char starts, deal with previous one. */
                        if (nbytes > 0) {
                                code = decodeutf8(buf, nbytes);
                                if (code == INVALID_CHAR)
                                        goto error;
                                nbytes2 = encodeutf8(code, buf2,
                                                     MAX_UTF8_BYTES);
                                if (nbytes != nbytes2 ||
                                    memcmp(buf, buf2, nbytes) != 0)
                                        goto error;
                                /* The multi-byte character was valid. */
                                ++col;
                        }
                        nbytes = 0;
                        /* If it's UTF8, start collecting again. */
                        if (c != EOF && c >= 0x80)
                                buf[nbytes++] = c;
                } else {
                        /* This is a continuation byte, append to buffer. */
                        if (nbytes == MAX_UTF8_BYTES)
                                goto error;
                        buf[nbytes++] = c;
                }

                if (c == EOF)
                        break;
                else if (c == '\n') {
                        ++line;
                        byteoff = 0;
                        col = 1;
                } else {
                        ++byteoff;
                        /*
                         * A 7-bit byte is a complete character by itself
                         * and never enters 'buf', so it has to be counted
                         * here for 'col' to be a real character position.
                         */
                        if (c < 0x80)
                                ++col;
                }
        }

        /* Defensive: the EOF pass through the loop already dealt with any
           pending bytes and reset nbytes to 0. */
        if (nbytes != 0)
                goto error;

        return 1;

error:
        if (!quiet) {
                printf("%s: line %lu, char %lu, byte offset %lu: "
                       "invalid UTF-8 code\n", filename, line, col, byteoff);
        }
        return 0;
}


/*
 * Write the usage summary, a one-line description and the program
 * version to standard output. 'program_name' is inserted into the
 * usage line.
 */
static void usage(const char *program_name) {
	fprintf(stdout, "Usage: %s [-hq] [--help] [--quiet] [file ...]\n",
	        program_name);
	fputs("Check whether input files are valid UTF-8.\n", stdout);
	fprintf(stdout, "This is version %s.\n", VERSION);
}


/*
 * Program entry point: parse the command line, then check every named
 * file, or standard input when no file is named.
 *
 * Options:
 *   -h, --help   print the usage text and exit with status 0
 *   -q, --quiet  set the 'quiet' flag passed to is_utf8_byte_stream()
 *
 * Exits with status 0 when every input could be opened and was found
 * valid, and with EXIT_FAILURE otherwise (including on an unrecognised
 * option).
 */
int main(int argc, char **argv) {
	int i, ok;
	FILE *file;

	int quiet;
	struct option options[] = {
		{ "help", no_argument, NULL, 'h' },
		{ "quiet", no_argument, &quiet, 1 },
		/* getopt_long() finds the end of the table by its all-zero
		   last element; without it, an unknown long option makes it
		   read past the array. */
		{ NULL, 0, NULL, 0 },
	};
	int opt;
	
	quiet = 0;
	
	while ((opt = getopt_long(argc, argv, "hq", options, NULL)) != -1) {
		switch (opt) {
		case 0:
			/* --quiet: getopt_long() stored 1 in 'quiet' itself. */
			break;
			
		case 'h':
			usage(argv[0]);
			exit(0);
			break;
			
		case 'q':
			quiet = 1;
			break;

		case '?':
			/* Unrecognised option. */
			exit(EXIT_FAILURE);

		default:
			abort();
		}
	}

	if (optind == argc)
		ok = is_utf8_byte_stream(stdin, "stdin", quiet);
	else {
		ok = 1;
		for (i = optind; i < argc; ++i) {
			file = fopen(argv[i], "r");
			if (file == NULL) {
				/* Save errno so both uses below report the
				   fopen() failure. */
				int err = errno;

				fprintf(stderr, "isutf8: %s: error %d: %s\n", 
				                argv[i], err, 
				                strerror(err));
				ok = 0;
			} else {
				if (! is_utf8_byte_stream(file, argv[i], quiet))
					ok = 0;
				(void) fclose(file);
			}
		}
	}
	
	if (ok)
		exit(0);
	exit(EXIT_FAILURE);
}

Mode	Type	Size	Ref	File
100644	blob	44	5d425843f23db3bb6970a55c953f345e3a8c8fe1	.gitattributes
100644	blob	17989	b7b5f53df1412df1e117607f18385b39004cdaa2	COPYING
100644	blob	1038	377121d56b79b9fbc42cabe86f4c3bae0d44bd2f	Makefile
100644	blob	1106	548acf2240b3ea1ad6276e39ec195133a8b6d4c6	README
100755	blob	806	83a4eed00f82e3bcc81856149b47cffc4091f9aa	check-isutf8
100755	blob	2607	a695935b24a5f2789c71a8affc4486859a41f737	combine
040000	tree	-	eebe48e402250fe244bf2856a5f7434178808e51	debian
100644	blob	13073	2de98a0b19372bff63be861b5adc755fa52fc74d	ifdata.c
100644	blob	7234	963943ee1bd1ae2ae3b087663a5e1d6cd961f246	ifdata.docbook
100644	blob	3006	d8ecea9b8bc416154533572e1ce85a0385b7af10	ifne.c
100644	blob	2360	41fa9abe7a23b63f5afd110dcd0b3f78b0e4c531	ifne.docbook
100644	blob	7581	c5f5eeb667c425c3ef02516712c08acb72f3f557	isutf8.c
100644	blob	2986	58355a2cd465fe70dd7722c28c1dc62ed85ef7e8	isutf8.docbook
100644	blob	5471	4925409bd548b058f07defe913724868801040df	lckdo.c
100644	blob	3532	effe84d29436f6749b7b0614b6a59c700f287ccf	lckdo.docbook
100644	blob	5783	d183d04a5f249072da9be3e1d30d4e205e1be021	mispipe.c
100644	blob	2464	bd8faa8601fe24b72f93985249be2163513f44f7	mispipe.docbook
100644	blob	5318	d283b96101fbc89ef214436bab316cf1092536f9	parallel.c
100644	blob	3598	d3ffcce639cc0880bb1aebb973354d7a6b54dbcd	parallel.docbook
100644	blob	1040	6ba38f78da10b61c8670b1c450fa769248ef84c4	pee.c
100644	blob	2225	fcb159aa0f9e44536c321b92e0ad589be04fd856	pee.docbook
100644	blob	7301	a53a2cf1906998c91533f5f5435ceeeeb1a7cd59	physmem.c
100644	blob	8570	242298deb641650bd25e8b68b52ca72f82becfa4	sponge.c
100644	blob	1859	24c432ac20eb502e58cf609037a3cfa4d1dc7a5f	sponge.docbook
100755	blob	2572	042cc18b8a0b4649874b229c039b7b9d79806bfc	ts
100755	blob	4495	a77739f27d8cab6843471de92857fe5064f9ace4	vidir
100755	blob	1402	fd61049c5ce903a1de06870cadf18a7c2b1e4137	vipe
100755	blob	2518	98d1445a5f8106f04be690f85d802a7f6decfd13	zrun

Hints:
Before your first commit, do not forget to set up your git environment:

git config --global user.name "your_name_here"
git config --global user.email "your@email_here"

Clone this repository using HTTP(S):

git clone https://rocketgit.com/user/nicolas/debian.moreutils

Clone this repository using ssh (do not forget to upload a key first):

git clone ssh://rocketgit@ssh.rocketgit.com/user/nicolas/debian.moreutils

Clone this repository using git:

git clone git://git.rocketgit.com/user/nicolas/debian.moreutils

You are allowed to anonymously push to this repository.
This means that your pushed commits will automatically be transformed into a merge request:

... clone the repository ...
... make some changes and some commits ...
git push origin main