RocketGit

nicolas / debian.moreutils (public) (License: GPL-2, GPL-2+, Expat, BSD-2-Clause, Public Domain) (since 2018-09-25) (hash sha1)

Debian packaging of joeyh's moreutils

Clone URLs: https://rocketgit.com/user/nicolas/debian.moreutils ssh://rocketgit@ssh.rocketgit.com/user/nicolas/debian.moreutils git://git.rocketgit.com/user/nicolas/debian.moreutils

debian/0.50 debian/0.51 debian/0.52 debian/0.53 debian/0.54-1 debian/0.55-1 debian/0.55-2 debian/0.56-1 debian/0.57-1 debian/0.58-1 debian/0.59-1 debian/0.62-1 upstream/0.28 upstream/0.29 upstream/0.30 upstream/0.31 upstream/0.32 upstream/0.33 upstream/0.34 upstream/0.35 upstream/0.36 upstream/0.37 upstream/0.38 upstream/0.39 upstream/0.40 upstream/0.41 upstream/0.42 upstream/0.43 upstream/0.44 upstream/0.45 upstream/0.46 upstream/0.47 upstream/0.48 upstream/0.49 upstream/0.50 upstream/0.51 upstream/0.52 upstream/0.53 upstream/0.54 upstream/0.55 upstream/0.56 upstream/0.57 upstream/0.58 upstream/0.59 upstream/0.60 upstream/0.61 upstream/0.62 upstream/debian/0.50 upstream/debian/0.51 upstream/debian/0.52 upstream/debian/0.53 upstream/debian/0.54-1 upstream/debian/0.55-1 upstream/debian/0.55-2 auto-test master

/isutf8.c (4306c7d7a22b230db58248e3317ada4a1f60bb34) (7296 bytes) (mode 100644) (type blob)

/*
 * isutf8.c - do the input files look like valid utf-8 byte streams?
 * 
 * Copyright (C) 2005  Lars Wirzenius
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
 */

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <getopt.h>


#define VERSION "1.1"


/*
 * Code to indicate an invalid UTF8 character.
 *
 * This is a macro rather than an enumeration constant because the value
 * does not fit in a 32-bit int, and ISO C (before C23) requires every
 * enumeration constant to be representable as int. The value is larger
 * than 0x7FFFFFFF, the largest code that encodeutf8() accepts.
 */
#define INVALID_CHAR 0xffffffffUL


/*
 * Write the shortest UTF8 encoding of the value 'u' (at most 31 bits)
 * into 'buf' and return how many bytes were written (1 to 6).
 *
 * Return -1, leaving 'buf' untouched, if 'u' is larger than 0x7FFFFFFF
 * or if the encoding would need more than 'maxbytes' bytes.
 */
static int encodeutf8(unsigned long u, unsigned char *buf, size_t maxbytes)
{
        /* limit[k] is the largest value that fits in k+1 encoded bytes. */
        static const unsigned long limit[] = {
            0x0000007F,
            0x000007FF,
            0x0000FFFF,
            0x001FFFFF,
            0x03FFFFFF,
            0x7FFFFFFF,
        };
        enum { NLIMITS = sizeof(limit) / sizeof(limit[0]) };
        unsigned char lead;
        int len, pos;

        if (u > limit[NLIMITS - 1])
                return -1;

        /* Find the smallest length whose limit covers the value. */
        len = 1;
        while (u > limit[len - 1])
                ++len;
        assert(len <= NLIMITS);

        if ((size_t) len > maxbytes)
                return -1;

        /* Single-byte values are stored as they are. */
        if (len == 1) {
                buf[0] = u;
                return len;
        }

        /* Fill the continuation bytes from the end, six bits at a time. */
        for (pos = len - 1; pos > 0; --pos) {
                buf[pos] = 0x80 | (u & 0x3f);
                u >>= 6;
        }

        /* The lead byte carries 'len' one bits above the remaining bits. */
        lead = ~(0xFF >> len);
        buf[0] = lead | u;

        return len;
}


/*
 * Count the consecutive one bits at the top of a byte, starting at
 * bit 7 and moving towards bit 0. Only the low eight bits of 'c' are
 * examined, so the result is between 0 and 8.
 */
static int high_ones(int c) {
        int count = 0;
        int mask;

        for (mask = 0x80; mask != 0 && (c & mask) != 0; mask >>= 1)
                ++count;
        return count;
}


/*
 * Decode the single UTF8 character stored in the first 'nbytes' bytes
 * of 'buf' and return its character code.
 *
 * INVALID_CHAR is returned when 'nbytes' is not positive, when a lone
 * byte is not plain ASCII, when the number of leading one bits in the
 * first byte does not equal 'nbytes', or when any following byte is not
 * of the form 10xxxxxx.
 */
static unsigned long decodeutf8(unsigned char *buf, int nbytes)
{
        unsigned long value;
        unsigned char *p;
        int lead_bits;

        if (nbytes <= 0)
                return INVALID_CHAR;

        /* A one-byte character must be in the ASCII range. */
        if (nbytes == 1) {
                if (buf[0] < 0x80)
                        return buf[0];
                return INVALID_CHAR;
        }

        /* The lead byte announces the sequence length in its top bits. */
        lead_bits = high_ones(buf[0]);
        if (lead_bits != nbytes)
                return INVALID_CHAR;

        /* Payload bits of the lead byte, then six bits per continuation. */
        value = buf[0] & (0xff >> lead_bits);
        for (p = buf + 1; p < buf + nbytes; ++p) {
                if ((*p & 0xC0) != 0x80)
                        return INVALID_CHAR;
                value = (value << 6) | (*p & 0x3f);
        }
        return value;
}


/*
 * Determine if the contents of an open file form a valid UTF8 byte stream.
 *
 * The bytes of one multi-byte character are collected into a buffer; when
 * the next character starts (or EOF is reached) the collected bytes are
 * decoded, re-encoded, and compared with the original bytes. This rejects
 * malformed sequences as well as encodings that are not the shortest form.
 *
 * Return 1 if the whole stream is valid and 0 otherwise. Unless 'quiet'
 * is non-zero, the position of the first invalid sequence is printed on
 * stdout: the line and char numbers are 1-based and identify the
 * offending character; the byte offset is the 0-based offset, within the
 * current line, of the byte (or end of file) at which the problem was
 * detected. A read error is reported on stderr and also yields 0.
 */
static int is_utf8_byte_stream(FILE *file, char *filename, int quiet) {
        enum { MAX_UTF8_BYTES = 6 };
        unsigned char buf[MAX_UTF8_BYTES];
        unsigned char buf2[MAX_UTF8_BYTES];
        int nbytes, nbytes2;
        int c;
        unsigned long code;
        unsigned long line, col, byteoff;

        nbytes = 0;
        line = 1;
        col = 1;
        byteoff = 0;

        for (;;) {
                c = getc(file);

                /*
                 * getc() returns EOF for read errors as well as for end
                 * of file. Without this check a stream that cannot be
                 * read would be reported as valid UTF-8.
                 */
                if (c == EOF && ferror(file)) {
                        int saved_errno = errno;

                        fprintf(stderr, "isutf8: %s: error %d: %s\n",
                                filename, saved_errno,
                                strerror(saved_errno));
                        return 0;
                }

                if (c == EOF || c < 0x80 || (c & 0xC0) != 0x80) {
                        /* New char starts, deal with previous one. */
                        if (nbytes > 0) {
                                code = decodeutf8(buf, nbytes);
                                if (code == INVALID_CHAR)
                                        goto error;
                                nbytes2 = encodeutf8(code, buf2,
                                                     MAX_UTF8_BYTES);
                                if (nbytes != nbytes2 ||
                                    memcmp(buf, buf2, nbytes) != 0)
                                        goto error;
                                ++col;
                        }
                        nbytes = 0;
                        /* If it's UTF8, start collecting again. */
                        if (c != EOF && c >= 0x80)
                                buf[nbytes++] = c;
                } else {
                        /* This is a continuation byte, append to buffer. */
                        if (nbytes == MAX_UTF8_BYTES)
                                goto error;
                        buf[nbytes++] = c;
                }

                if (c == EOF)
                        break;
                else if (c == '\n') {
                        ++line;
                        byteoff = 0;
                        col = 1;
                } else {
                        ++byteoff;
                        /*
                         * An ASCII byte is a complete character by
                         * itself, so it advances the character column
                         * right away; multi-byte characters are counted
                         * above once they have been validated.
                         */
                        if (c < 0x80)
                                ++col;
                }
        }

        /*
         * The EOF iteration above has already validated any pending
         * sequence and reset 'nbytes', so nothing is left to check here.
         */
        return 1;

error:
        if (!quiet) {
                printf("%s: line %lu, char %lu, byte offset %lu: "
                       "invalid UTF-8 code\n", filename, line, col, byteoff);
        }
        return 0;
}


/*
 * Print a short usage summary and the program version on stdout.
 * 'program_name' is inserted into the usage line as given.
 */
static void usage(const char *program_name) {
	FILE *out = stdout;

	fprintf(out, "Usage: %s [-hq] [--help] [--quiet] [file ...]\n",
	        program_name);
	fputs("Check whether input files are valid UTF-8.\n", out);
	fprintf(out, "This is version %s.\n", VERSION);
}


/*
 * Entry point: parse the command-line options, then check every file
 * named on the command line, or stdin when no file is given.
 *
 * The exit status is 0 when every input is valid UTF-8, and EXIT_FAILURE
 * when an input is invalid, a file cannot be opened, or an unknown
 * option is given.
 */
int main(int argc, char **argv) {
	int i, ok;
	FILE *file;

	int quiet;
	struct option options[] = {
		{ "help", no_argument, NULL, 'h' },
		{ "quiet", no_argument, &quiet, 1 },
		/*
		 * getopt_long() needs an all-zero entry to find the end of
		 * the array; without it an unrecognised long option makes
		 * it read past the last element.
		 */
		{ NULL, 0, NULL, 0 },
	};
	int opt;
	
	quiet = 0;
	
	while ((opt = getopt_long(argc, argv, "hq", options, NULL)) != -1) {
		switch (opt) {
		case 0:
			/* --quiet: getopt_long() already stored 1 in 'quiet'. */
			break;
			
		case 'h':
			usage(argv[0]);
			exit(0);
			break;
			
		case 'q':
			quiet = 1;
			break;

		case '?':
			/* getopt_long() has already printed a diagnostic. */
			exit(EXIT_FAILURE);

		default:
			abort();
		}
	}

	if (optind == argc)
		ok = is_utf8_byte_stream(stdin, "stdin", quiet);
	else {
		ok = 1;
		for (i = optind; i < argc; ++i) {
			file = fopen(argv[i], "r");
			if (file == NULL) {
				/* Save errno before any further library call. */
				int saved_errno = errno;

				fprintf(stderr, "isutf8: %s: error %d: %s\n", 
				                argv[i], saved_errno, 
				                strerror(saved_errno));
				ok = 0;
			} else {
				if (! is_utf8_byte_stream(file, argv[i], quiet))
					ok = 0;
				(void) fclose(file);
			}
		}
	}
	
	if (ok)
		exit(0);
	exit(EXIT_FAILURE);
}

Mode	Type	Size	Ref	File
100644	blob	17989	b7b5f53df1412df1e117607f18385b39004cdaa2	COPYING
100644	blob	933	b0595c417e33a8115ed8e2f62e50e9762d5cede7	Makefile
100644	blob	1048	ccc257cd3e6190cafffba1b73125677a16beea33	README
100755	blob	687	3abb315c7a9592c44f884cf5817dd062f9804f43	check-isutf8
100755	blob	2607	a695935b24a5f2789c71a8affc4486859a41f737	combine
040000	tree	-	d2320f6e716f8d093815fd86cc881880d24fcc60	debian
100644	blob	12767	f8fa0240cfe7d245c22dbd09d77be08e479ada1d	ifdata.c
100644	blob	7039	5f2837f71cb7ea8e1aa8481d854d975174e19ab9	ifdata.docbook
100644	blob	3006	d8ecea9b8bc416154533572e1ce85a0385b7af10	ifne.c
100644	blob	2360	41fa9abe7a23b63f5afd110dcd0b3f78b0e4c531	ifne.docbook
100644	blob	7296	4306c7d7a22b230db58248e3317ada4a1f60bb34	isutf8.c
100644	blob	2894	f9c9eb59e9e15197e686a25a93d8785e4522696a	isutf8.docbook
100644	blob	5471	4925409bd548b058f07defe913724868801040df	lckdo.c
100644	blob	3261	8a0a4a863aba57a7a4d7b06b69414c25c21dfa17	lckdo.docbook
100644	blob	5777	43ba76aa6f3d24cfe8bb90b3776b044f27df9d37	mispipe.c
100644	blob	2292	b645b2c756f9b79cdde96a4a82c63bd9fd60fbff	mispipe.docbook
100644	blob	1040	6ba38f78da10b61c8670b1c450fa769248ef84c4	pee.c
100644	blob	2082	18c753f289f920c9fddc06b191a2c7a031bbe391	pee.docbook
100644	blob	7301	a53a2cf1906998c91533f5f5435ceeeeb1a7cd59	physmem.c
100644	blob	8551	80733a22387f8290b77434f04a6a5dae099cc6b9	sponge.c
100644	blob	1757	f9395a72eb845f0c8007dd2777501a305243f579	sponge.docbook
100755	blob	2515	ca150b4a36105ee55ca72920d6adce1aafd9a05a	ts
100755	blob	4495	a77739f27d8cab6843471de92857fe5064f9ace4	vidir
100755	blob	1260	8cee2b1775c19a3e7e046984a325eebfc8b94ecf	vipe
100755	blob	2458	cedfbaee3b39348daa81758057c0adbe66374dbd	zrun

Hints:
Before your first commit, do not forget to set up your git environment:

git config --global user.name "your_name_here"
git config --global user.email "your@email_here"

Clone this repository using HTTP(S):

git clone https://rocketgit.com/user/nicolas/debian.moreutils

Clone this repository using ssh (do not forget to upload a key first):

git clone ssh://rocketgit@ssh.rocketgit.com/user/nicolas/debian.moreutils

Clone this repository using git:

git clone git://git.rocketgit.com/user/nicolas/debian.moreutils

You are allowed to anonymously push to this repository.
This means that your pushed commits will automatically be transformed into a merge request:

... clone the repository ...
... make some changes and some commits ...
git push origin main