/*  SPLIT.PL
    Shelved on the 21st December 1987
*/

/*
This program  allows one to separate  text files which have  been packed
into a  larger file.  In particular, you  can use  it to  separate files
belonging to the Prolog Library which have been packed in this way.


The main predicate is  'split', of arity zero. It asks  you for the name
of  a composite  text file.  Type this  name  on a  line, terminated  by
RETURN. 'split' then splits that  file into its component subfiles. Each
subfile must have this structure:

    <Header line>
    <Text line> zero or more times
    <Tailer line>

The header line of a subfile is of the form
    <Signal> ^START^ <Filename> ^
where <Signal> is  a sequence of characters which is  unlikely to appear
in any source text. In the program, it is
    )(*&%$#@!
at the beginning  of a line. You  can change it by  altering signal/1 at
the top of  the program. You can  alter the way filenames  are put after
the signal by altering matches_file_start_signal/2 in the program.

The tailer line is
    <Signal> ^END^ <Filename> ^

You can alter it by changing copy_2/4 and matches_file_end_signal/1.


'split' writes the text between the header and tailer of each subfile to
the  file  named in  the  header  line.  The  effect of  I/O  errors  is
undefined.


HERE IS AN EXAMPLE COMPOSITE FILE (indented by four spaces):
    )(*&%$#@!^START^EX.OUT^
    line 1
    line 2
    line 3
    line Final
    )(*&%$#@!^END^EX.OUT^
    )(*&%$#@!^START^LEX.OUT^
    templates

        det(Det)

            argument    is  E1 type entity
            result      is  E2=Det(E1) type entity
            nextlex     is  ( adj | ipn )
            lex         is  self
            semfn       is  specify: E1 -> E2
            awaitc      is  E1 type entity

        ipn(Ipn) *

            if  nextlex \= ipn then
                argument    is  ()
                result      is  E=referent(Ipn) type entity
                nextlex     is  none_expected
                lex         is  self
                semfn       is  referent: E
                awaitc      is  none_expected
            else
                argument    is  E1 type entity
                result      is  E2=ModByNoun(E1) type entity
                nextlex     is  none_expected
                lex         is  self
                semfn       is  specify: E1 -> E2
                awaitc      is  E1 type entity
    )(*&%$#@!^END^LEX.OUT^
    )(*&%$#@!^START^REC.OUT^
    section struct_cons => nonsyntax cons_record;
    define make_record( record_name, nargs ) -> result;
        lvars record_name, nargs, result;
        lvars record_key=key_of_dataword(record_name);
        lvars record_size=class_datasize( record_key ) - 1;
        repeat record_size times
            undef
        endrepeat;
        class_cons( record_key )() -> result;
        lvars field_proc value;
        repeat nargs times
            valof() -> field_proc;
            -> field_proc( result );
        endrepeat;
    enddefine;
    )(*&%$#@!^END^REC.OUT^
THATS THE END OF THE SAMPLE COMPOSITE FILE.


PORTABILITY:

I/O errors are  not trapped, because there is no  standard way to do
so.

It is assumed that characters  are represented as integer codes, and
that (e.g.) the notation "fred" is equivalent to a list of the codes
for those four characters.

The  predicates 'is_newline'/1  and  'is_eof'/1 at  the  top of  the
program  define   which  characters  are  returned   by  'get0'  for
end-of-line and end-of-file.

It is assumed that 'get0' will  return a unique end-of-file code for
the character after the final newline of the final subfile.


OPTIMISATION:

For those whose  Prologs do not optimise  tail-recursion, I've tried
to avoid going  too deep in levels of recursion.  The obvious way to
implement 'copy' (which  copies the file line-by-line)  is to copy a
line, and then re-call 'copy'.

This however causes  some implementations to run out  of local stack
space.  If 'copy'  did  not need  to  keep a  line  number, I  could
implement it as a repeat/fail loop.  However, since it needs to pass
an incremented line  number from call to call, I'd  have to preserve
that number by  asserting and retracting on  each repeat/fail cycle;
this would cause many garbage collections, and would be very slow.

I've compromised  by defining 'copy_2'. This  does a maximum  of 200
self-recursive calls,  and then fails (it  tests the line  number to
see when to fail). As it  fails, it asserts the current line-number;
'copy' detects the  failure, picks up the  line-number, and re-calls
copy_2, with we hope the previous 200 levels of stack space deleted.
The Call argument  to copy_2 is to stop it  failing back into 'copy'
just after it's been re-called.

Finally, I've also  avoided generating atoms. Instead,  lines of the
file   are  kept   as   lists  of   character   codes,  written   by
'writef_ascii'.
*/


/*  PORTABILITY  */


/*  is_newline( C ):

    True for those character codes C which get0 returns at the end-of-line.
*/
is_newline( 10 ).


/*  is_eof( C ):

    True for those character codes C which get0 returns at the end-of-file.
*/
is_eof( 26 ).


/*  signal( S ):

    S is the list of characters signalling the start of a new sub-file.
*/
signal( ")(*&%$#@!" ).


/*  OUTPUT  */


:- op( 40, xfy, <> ).
:- op( 40, xfy, ... ).


/*  writef( V+ ):

    Write V to the COS.
    If V = nl, take a newline.
    If V = A<>B, writef A, then B.
    Treat A...B as A<>' '<>B.
    If V = '$'(L), assume L is a list of ASCII codes, turn into an atom,
     and write that.
    Else, write V as it is.
*/
writef( V ) :-
    var( V ), !, write(V).

writef( A<>B ) :-
    !, writef(A), writef(B).

writef( A...B ) :-
    !, writef(A), writef(' '), writef(B).

writef( '$'(L) ) :-
    !, writef_ascii( L ).

writef( nl ) :-
    !, nl.

writef( X ) :-
    write( X ).


/*  writef_ascii( L+ ):
    Put each character code in list L.
*/
writef_ascii( [] ) :- !.
writef_ascii( [C|T] ) :-
    put( C ),
    writef_ascii( T ), !.


/*  writef_to( File+, Text+ ):

    Do writef(Text), but tell(File) first, and restore the old
    COS after.
*/
writef_to( File, Text ) :-
    telling( COS ),
    tell( File ),
    writef( Text ),
    tell( COS ).


/*  FILE COPYING  */


/*  split_1:

    Read a sequence of sub-files from the CIS, copying each to
    its destination. Stop and succeed on eof.

    This was tail-recursive, but I've made it use a nasty repeat/fail
    loop for those whose Prologs don't re-use space on tail-recursion.                   
*/
split_1 :-
    repeat,
        (
            not(( read_line( S ),
                  do_file( S )
            ))
        ).
split_1.


/*  do_file( Line+ ):

    Line is the next line from the CIS.
    It should signal the start of a new sub-file to be split off.
    If it does, split it, leaving the CIS at EOF or the start of the next
    sub-file.
    Else, give an error.
*/
do_file( Line1 ) :-
    matches_file_start_signal( Line1, Filename ),
    copy_file( Filename ), !.

do_file( Line1 ) :-
    writef_to( user, 'File name not found where expected:'<>nl ),
    writef_to( user, '$'(Line1)<>nl ).


/*  copy_file( Name+ ):

    The next line to be read from CIS is the first line of the contents
    of a subfile (line after the header).
    Copy this subfile to file Name; close Name; leave CIS on header
    line of next subfile; restore old COS.
*/
copy_file( Name ) :-
    writef_to( user, 'Copying file: '<>Name<>nl ),
    telling( COS ),
    tell( Name ),
    copy( call_1, Name, 0, LastLineNo ),
    writef_to( user, 'Copied file: '<>Name<>
                     ' ('<>LastLineNo<>' lines )'<>nl ),
    told,
    tell( COS ),
    fail.
    /*  Force stack to be cleared.  */
copy_file( _ ) :- !.


/*  copy( Name+, LineNo+, FinalLineNo- ):

    Copy from LineNo to the final line of the current subfile.
    These arguments are used only for messages to the user.
*/
copy( Call, Name, LineNo, FinalLineNo ) :-
    copy_2( Call, Name, LineNo, FinalLineNo ).

copy( _, Name, _, FinalLineNo ) :-
    retract( '$copy'(LineNo) ),
    copy( recall, Name, LineNo, FinalLineNo ).


copy_2( call_1, Name, LineNo, FinalLineNo ) :-
    ( LineNo mod 200 ) =:= 199,
    !,
    writef_to(user,fail<>nl),
    asserta( '$copy'(LineNo) ),
    fail.

copy_2( _, Name, LineNo, FinalLineNo ) :-
    read_line( S ),
    !,
    (
        matches_file_end_signal( S ),
        FinalLineNo = LineNo
    ;
        writef_ascii( S ), nl,
        NextLineNo is LineNo + 1,
        copy_2( call_1, Name, NextLineNo, FinalLineNo )
    ), !.

copy_2( _, Name, LineNo, LineNo ) :-
    /*  Here if 'read_line' failed.  */
    writef_to( user, 'End of file on reading '<>Name<>nl ), !.


/*  READING LINES  */


/*  read_line_as_atom( A- ):

    read_line reads the current line into A as an atom
    (not including the terminating newline), and leaves the CIS
    positioned just after the newline.
*/
read_line_as_atom( A ) :-
    read_line( L ),
    name( A, L ).


/*  read_line( L- ):

    read_line reads the current line into L as a list of ASCII codes
    (not including the terminating newline), and leaves the CIS
    positioned just after the newline.

    It fails if the first character of the newline is an end-of-file, but
    the effect of end-of-file in the middle of a line is undefined.
*/
read_line( L ) :-
    get0( C ),
    test_eof_or_read_rest( C, L ).


/*  test_eof_or_read_rest( C+, L- ):

    The CIS is just after a newline.
    If C is an eof, then fail.
    Else read the line into L.
*/
test_eof_or_read_rest( C, L ) :-
    is_eof( C ), !, fail.

test_eof_or_read_rest( C, L ) :-
    read_rest_of_line_as_list( C, L ), !.


/*  read_rest_of_line_as_list( C+, L- ):

    This predicate reads the rest of the current line into list L, putting
    character code C on the front of L.
    It leaves the CIS  positioned after the newline character.
*/
read_rest_of_line_as_list( C, [] ) :-
    is_newline( C ), !.

read_rest_of_line_as_list( C, [] ) :-
    is_eof( C ), !.

read_rest_of_line_as_list( C, [C|Rest] ) :-
    get0( NextC ),
    read_rest_of_line_as_list( NextC, Rest ).


/*  MATCHING FILENAMES  */


/*  matches_file_start_signal( Line+, Name- ):

    If Line is the header of a sub-file, then Name becomes an atom
    giving the name of that sub-files destination.
    Else fail.
*/
matches_file_start_signal( Line, NameAsAtom ) :-
    signal( Signal ),
    can_append( Signal, Rest, Line ), !,
    can_append( "^START^", Rest1, Rest ),
    can_append( Name, "^", Rest1 ),
    name( NameAsAtom, Name ), !.


/*  matches_file_end_signal( Line+ ):

    Succeed if Line is the tailer of a sub-file. It is assumed not
    to be the header.
*/
matches_file_end_signal( Line ) :-
    signal( Signal ),
    can_append( Signal, Rest, Line ), !,
    can_append( "^END^", Rest1, Rest ),
    can_append( Name, "^", Rest1 ), !.


/*  can_append( L1?, L2?, L3+ ):

    Succeeds if L1 appended to L2 gives L3.
*/
can_append( [], L, L ).
can_append( [A|X], Y, [A|Z] ) :-
    can_append( X, Y, Z ).


/*  MAIN PREDICATE  */


/*  split:

    Main predicate.
    Read a filename from the CIS (assumed to be the keyboard).
    Read from the file so named a sequence of sub-files.
    Copy each sub-file to its destination; then close the file and
    restore the old CIS.
*/
split :-
    write( 'Please type the name of the file you want to split' ),nl,
    write( 'and terminate with a RETURN (not with a dot).'),nl,
    read_line_as_atom( File ),
    seeing( CIS ),
    see( File ), seen,
    /*  Ensure the file isnt already open  */
    see( File ),
    split_1,
    seen,
    see( CIS ).