Move under www to ease rsync
This commit is contained in:
6
www/eps/hpr2610/hpr2610_awk12_ex1.awk
Executable file
6
www/eps/hpr2610/hpr2610_awk12_ex1.awk
Executable file
@@ -0,0 +1,6 @@
|
||||
# For every input record: split it with patsplit into fields matching
# /[^,]*/ (runs of characters that are not commas), then print the fields
# on one line, each followed by a space.
# NOTE(review): "for (i in a)" gives no guaranteed index order unless
# PROCINFO["sorted_in"] is set -- confirm the output order with your awk.
{
|
||||
patsplit($0,a,/[^,]*/)
|
||||
for (i in a)
|
||||
printf "%s ",a[i]
|
||||
print ""
|
||||
}
|
||||
23
www/eps/hpr2610/hpr2610_awk12_ex10.awk
Executable file
23
www/eps/hpr2610/hpr2610_awk12_ex10.awk
Executable file
@@ -0,0 +1,23 @@
|
||||
#!/usr/bin/awk -f
|
||||
|
||||
#
|
||||
# Sort the indices as strings in ascending order
# (PROCINFO["sorted_in"] is a GNU Awk feature and applies to every
# "for (i in array)" loop in the script, including the one in END)
|
||||
#
|
||||
BEGIN{
|
||||
PROCINFO["sorted_in"]="@ind_str_asc"
|
||||
}
|
||||
|
||||
#
|
||||
# Make a frequency table of the first letter of each word
# (the first character of field 1 of each record is the array index)
|
||||
#
|
||||
{
|
||||
freq[substr($1,1,1)]++
|
||||
}
|
||||
|
||||
#
|
||||
# Print the results in the frequency table
# as "letter: count", one line per distinct first character
|
||||
#
|
||||
END{
|
||||
for (i in freq)
|
||||
printf "%s: %d\n",i,freq[i]
|
||||
}
|
||||
6
www/eps/hpr2610/hpr2610_awk12_ex2.awk
Executable file
6
www/eps/hpr2610/hpr2610_awk12_ex2.awk
Executable file
@@ -0,0 +1,6 @@
|
||||
# For every input record: split it with patsplit using a CSV-like field
# pattern -- either a run of non-comma characters or a double-quoted string
# of one or more non-quote characters -- and print each field wrapped in
# angle brackets on a single line.
# NOTE(review): "for (i in a)" gives no guaranteed index order unless
# PROCINFO["sorted_in"] is set -- confirm the output order with your awk.
{
|
||||
patsplit($0,a,/([^,]*)|("[^"]+")/)
|
||||
for (i in a)
|
||||
printf "<%s> ",a[i]
|
||||
print ""
|
||||
}
|
||||
9
www/eps/hpr2610/hpr2610_awk12_ex3.awk
Executable file
9
www/eps/hpr2610/hpr2610_awk12_ex3.awk
Executable file
@@ -0,0 +1,9 @@
|
||||
# For every input record: split it into alphabetic fields (array a) and
# keep the separator strings in array s. Print the fields on one line and
# the separators s[1]..s[flds] on the next.
{
|
||||
# flds receives patsplit's return value and bounds the separator loop below
flds = patsplit($0,a,/[A-Za-z]+/,s)
|
||||
for (i in a)
|
||||
printf "%s ",a[i]
|
||||
print ""
|
||||
# Start at 1: s[0], the separator before the first field, is not printed
for (i=1; i<=flds; i++)
|
||||
printf "%s ",s[i]
|
||||
print ""
|
||||
}
|
||||
8
www/eps/hpr2610/hpr2610_awk12_ex4.awk
Executable file
8
www/eps/hpr2610/hpr2610_awk12_ex4.awk
Executable file
@@ -0,0 +1,8 @@
|
||||
# Request that every "for (i in array)" loop visits elements in ascending
# order of their values, compared as strings (GNU Awk PROCINFO feature).
BEGIN{
|
||||
PROCINFO["sorted_in"]="@val_str_asc"
|
||||
}
|
||||
# For every input record: split it on " " into array a and print each
# element as "index: value" in the scanning order requested above.
{
|
||||
split($0,a," ")
|
||||
for (i in a)
|
||||
printf "%d: %s\n",i,a[i]
|
||||
}
|
||||
8
www/eps/hpr2610/hpr2610_awk12_ex5.awk
Executable file
8
www/eps/hpr2610/hpr2610_awk12_ex5.awk
Executable file
@@ -0,0 +1,8 @@
|
||||
# Demonstrate asort() with a single argument on a numerically indexed
# array: the array is sorted in place, then printed as "index value".
# No input is read; everything happens in the BEGIN rule.
BEGIN{
|
||||
a[1]="Jones"
|
||||
a[2]="X"
|
||||
a[3]="Smith"
|
||||
# Sorts the values of a and re-indexes them with integers starting at 1
asort(a)
|
||||
for (i in a)
|
||||
printf "%s %s\n",i,a[i]
|
||||
}
|
||||
11
www/eps/hpr2610/hpr2610_awk12_ex6.awk
Executable file
11
www/eps/hpr2610/hpr2610_awk12_ex6.awk
Executable file
@@ -0,0 +1,11 @@
|
||||
# Demonstrate asort() with a destination array: the sorted values go into
# b (indexed from 1) while the string-indexed source array a is left as it
# was. Both arrays are printed for comparison.
BEGIN{
|
||||
a["a"]="Jones"
|
||||
a["b"]="X"
|
||||
a["c"]="Smith"
|
||||
# Sorted copy is written to b; a keeps its original indices and values
asort(a,b)
|
||||
for (i in b)
|
||||
printf "b[%s] = %s\n",i,b[i]
|
||||
print ""
|
||||
for (i in a)
|
||||
printf "a[%s] = %s\n",i,a[i]
|
||||
}
|
||||
8
www/eps/hpr2610/hpr2610_awk12_ex7.awk
Executable file
8
www/eps/hpr2610/hpr2610_awk12_ex7.awk
Executable file
@@ -0,0 +1,8 @@
|
||||
# Demonstrate asorti() with a single argument: the array is rebuilt so
# that its values are the sorted former indices, stored under new numeric
# indices; the original values are lost. Prints "index value" pairs.
BEGIN{
|
||||
a["third"]="Jones"
|
||||
a["second"]="X"
|
||||
a["first"]="Smith"
|
||||
# In-place index sort: a now holds "first", "second", "third" as values
asorti(a)
|
||||
for (i in a)
|
||||
printf "%s %s\n",i,a[i]
|
||||
}
|
||||
20
www/eps/hpr2610/hpr2610_awk12_ex8.awk
Executable file
20
www/eps/hpr2610/hpr2610_awk12_ex8.awk
Executable file
@@ -0,0 +1,20 @@
|
||||
# Demonstrate asorti() with a destination array: b receives the sorted
# indices of a, and a itself is left unchanged. The script prints a, then
# b, then uses the values of b as keys to read a in sorted-index order.
BEGIN{
|
||||
a["third"]="Jones"
|
||||
a["second"]="X"
|
||||
a["first"]="Smith"
|
||||
# b[1..3] become the indices of a in sorted order
asorti(a,b)
|
||||
|
||||
print "What array a contains:"
|
||||
for (i in a)
|
||||
printf "a[%s] = %s\n",i,a[i]
|
||||
print ""
|
||||
|
||||
print "What array b contains:"
|
||||
for (i in b)
|
||||
printf "b[%s] = %s\n",i,b[i]
|
||||
print ""
|
||||
|
||||
print "Accessing original array a with sorted indices in b"
|
||||
for (i in b)
|
||||
# b[i] is an index of a, so a[b[i]] is the matching original value
printf "%6s: %s\n",b[i],a[b[i]]
|
||||
}
|
||||
8
www/eps/hpr2610/hpr2610_awk12_ex9.awk
Executable file
8
www/eps/hpr2610/hpr2610_awk12_ex9.awk
Executable file
@@ -0,0 +1,8 @@
|
||||
# Demonstrate the third ("how") argument of asort(): the values of a are
# sorted into b in descending order, compared as strings, and b is printed
# as "index value" pairs.
BEGIN{
|
||||
a["a"]="Jones"
|
||||
a["b"]="X"
|
||||
a["c"]="Smith"
|
||||
# "@val_str_desc": order by element values, descending, as strings
asort(a,b,"@val_str_desc")
|
||||
for (i in b)
|
||||
printf "%s %s\n",i,b[i]
|
||||
}
|
||||
14
www/eps/hpr2610/hpr2610_awk12_extra.awk
Executable file
14
www/eps/hpr2610/hpr2610_awk12_extra.awk
Executable file
@@ -0,0 +1,14 @@
|
||||
#!/usr/bin/awk -f
|
||||
#
|
||||
# Awk script to take a sequence of words separated by spaces and turn them
|
||||
# into a string where each word is followed by as many hyphens as there are
|
||||
# letters in the word itself.
|
||||
#
|
||||
{
|
||||
for (i=1; i<=NF; i++){
|
||||
# Work on a copy of the field so that $i itself is not modified
fill=$i
|
||||
# Replace every character of the copy with a hyphen
gsub(/./,"-",fill)
|
||||
# No separator or newline here: words and hyphen runs are concatenated
printf "%s%s",$i,fill
|
||||
}
|
||||
# End the output line once all fields of the record have been written
print ""
|
||||
}
|
||||
BIN
www/eps/hpr2610/hpr2610_full_shownotes.epub
Executable file
BIN
www/eps/hpr2610/hpr2610_full_shownotes.epub
Executable file
Binary file not shown.
518
www/eps/hpr2610/hpr2610_full_shownotes.html
Executable file
518
www/eps/hpr2610/hpr2610_full_shownotes.html
Executable file
@@ -0,0 +1,518 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="generator" content="pandoc">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
|
||||
<meta name="author" content="Dave Morriss">
|
||||
<title>Gnu Awk - Part 12 (HPR Show 2610)</title>
|
||||
<style type="text/css">code{white-space: pre;}</style>
|
||||
<!--[if lt IE 9]>
|
||||
<script src="http://html5shim.googlecode.com/svn/trunk/html5.js"></script>
|
||||
<![endif]-->
|
||||
<style type="text/css">
|
||||
a.sourceLine { display: inline-block; line-height: 1.25; }
|
||||
a.sourceLine { pointer-events: none; color: inherit; text-decoration: inherit; }
|
||||
a.sourceLine:empty { height: 1.2em; position: absolute; }
|
||||
.sourceCode { overflow: visible; }
|
||||
code.sourceCode { white-space: pre; position: relative; }
|
||||
div.sourceCode { margin: 1em 0; }
|
||||
pre.sourceCode { margin: 0; }
|
||||
@media screen {
|
||||
div.sourceCode { overflow: auto; }
|
||||
}
|
||||
@media print {
|
||||
code.sourceCode { white-space: pre-wrap; }
|
||||
a.sourceLine { text-indent: -1em; padding-left: 1em; }
|
||||
}
|
||||
pre.numberSource a.sourceLine
|
||||
{ position: relative; }
|
||||
pre.numberSource a.sourceLine:empty
|
||||
{ position: absolute; }
|
||||
pre.numberSource a.sourceLine::before
|
||||
{ content: attr(data-line-number);
|
||||
position: absolute; left: -5em; text-align: right; vertical-align: baseline;
|
||||
border: none; pointer-events: all;
|
||||
-webkit-touch-callout: none; -webkit-user-select: none;
|
||||
-khtml-user-select: none; -moz-user-select: none;
|
||||
-ms-user-select: none; user-select: none;
|
||||
padding: 0 4px; width: 4em;
|
||||
color: #aaaaaa;
|
||||
}
|
||||
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; }
|
||||
div.sourceCode
|
||||
{ }
|
||||
@media screen {
|
||||
a.sourceLine::before { text-decoration: underline; }
|
||||
}
|
||||
code span.al { color: #ff0000; font-weight: bold; } /* Alert */
|
||||
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
|
||||
code span.at { color: #7d9029; } /* Attribute */
|
||||
code span.bn { color: #40a070; } /* BaseN */
|
||||
code span.bu { } /* BuiltIn */
|
||||
code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
|
||||
code span.ch { color: #4070a0; } /* Char */
|
||||
code span.cn { color: #880000; } /* Constant */
|
||||
code span.co { color: #60a0b0; font-style: italic; } /* Comment */
|
||||
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
|
||||
code span.do { color: #ba2121; font-style: italic; } /* Documentation */
|
||||
code span.dt { color: #902000; } /* DataType */
|
||||
code span.dv { color: #40a070; } /* DecVal */
|
||||
code span.er { color: #ff0000; font-weight: bold; } /* Error */
|
||||
code span.ex { } /* Extension */
|
||||
code span.fl { color: #40a070; } /* Float */
|
||||
code span.fu { color: #06287e; } /* Function */
|
||||
code span.im { } /* Import */
|
||||
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
|
||||
code span.kw { color: #007020; font-weight: bold; } /* Keyword */
|
||||
code span.op { color: #666666; } /* Operator */
|
||||
code span.ot { color: #007020; } /* Other */
|
||||
code span.pp { color: #bc7a00; } /* Preprocessor */
|
||||
code span.sc { color: #4070a0; } /* SpecialChar */
|
||||
code span.ss { color: #bb6688; } /* SpecialString */
|
||||
code span.st { color: #4070a0; } /* String */
|
||||
code span.va { color: #19177c; } /* Variable */
|
||||
code span.vs { color: #4070a0; } /* VerbatimString */
|
||||
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
|
||||
</style>
|
||||
<link rel="stylesheet" href="http://hackerpublicradio.org/css/hpr.css">
|
||||
</head>
|
||||
|
||||
<body id="home">
|
||||
<div id="container" class="shadow">
|
||||
<header>
|
||||
<h1 class="title">Gnu Awk - Part 12 (HPR Show 2610)</h1>
|
||||
<h2 class="author">Dave Morriss</h2>
|
||||
<hr/>
|
||||
</header>
|
||||
|
||||
<main id="maincontent">
|
||||
<article>
|
||||
<header>
|
||||
<h1>Table of Contents</h1>
|
||||
<nav id="TOC">
|
||||
<ul>
|
||||
<li><a href="#introduction">Introduction</a></li>
|
||||
<li><a href="#more-about-arrays-in-awk">More about arrays in Awk</a><ul>
|
||||
<li><a href="#using-patsplit">Using <code>patsplit</code></a><ul>
|
||||
<li><a href="#examples">Examples</a></li>
|
||||
</ul></li>
|
||||
<li><a href="#sorting-arrays">Sorting arrays</a><ul>
|
||||
<li><a href="#using-procinfo">Using <code>PROCINFO</code></a></li>
|
||||
<li><a href="#using-awks-array-sorting-functions">Using Awk’s Array Sorting Functions</a></li>
|
||||
<li><a href="#extra-example">Extra example</a></li>
|
||||
</ul></li>
|
||||
</ul></li>
|
||||
<li><a href="#yet-more-about-arrays">Yet more about arrays</a></li>
|
||||
<li><a href="#real-world-awk-example">Real-world Awk example</a></li>
|
||||
<li><a href="#links">Links</a></li>
|
||||
</ul>
|
||||
</nav>
|
||||
</header>
|
||||
<h2 id="introduction">Introduction</h2>
|
||||
<p>This is the twelfth episode of the “<a href="http://hackerpublicradio.org/series.php?id=94" title="Learning Awk">Learning Awk</a>” series which is being produced by <a href="http://hackerpublicradio.org/correspondents.php?hostid=300" title="Mr. Young">Mr. Young</a> and myself.</p>
|
||||
<p>In this episode I want to continue with the subject I started in episode 10, an advanced-level look at arrays in Awk.</p>
|
||||
<p>In case it might be of interest I have also included a section describing a recent use I made of <code>awk</code> to solve a problem, though this does not use arrays.</p>
|
||||
<h2 id="more-about-arrays-in-awk">More about arrays in Awk</h2>
|
||||
<h3 id="using-patsplit">Using <code>patsplit</code></h3>
|
||||
<p>We saw the <code>split</code> function in episode 10, but there is also a more powerful function for splitting strings into array elements called <code>patsplit</code>.</p>
|
||||
<dl>
|
||||
<dt><em>patsplit(<strong>string</strong>, <strong>array</strong> [, <strong>fieldpat</strong> [, <strong>seps</strong> ] ])</em></dt>
|
||||
<dd><p>Divide <em>string</em> into pieces defined by <em>fieldpat</em> and store the pieces in <em>array</em> and the separator strings in the <em>seps</em> array.<br />
|
||||
This works in the same way as <em>split</em>, described in <a href="http://hackerpublicradio.org/eps/hpr2526" title="Gnu Awk - Part 10">episode 10</a>; consult that episode for the details of this type of string splitting. The main difference from <em>split</em> is that the third argument, <em>fieldpat</em>, is a regular expression which defines the field rather than the separator.</p>
|
||||
</dd>
|
||||
</dl>
|
||||
<h4 id="examples">Examples</h4>
|
||||
<p><b>1.</b> Using <code>patsplit</code> to split a comma-delimited string. This could just as well have been done by setting the <code>FS</code> variable and using <code>awk</code>’s standard splitting mechanism (or <code>FPAT</code> which has not been covered in this series so far):</p>
|
||||
<pre><code>$ cat awk12_ex1.awk
|
||||
{
|
||||
patsplit($0,a,/[^,]*/)
|
||||
for (i in a)
|
||||
printf "%s ",a[i]
|
||||
print ""
|
||||
}
|
||||
$ x="An apple a day keeps the doctor away"
|
||||
$ echo "${x// /,}"
|
||||
An,apple,a,day,keeps,the,doctor,away
|
||||
$ echo "${x// /,}" | awk -f awk12_ex1.awk
|
||||
An apple a day keeps the doctor away</code></pre>
|
||||
<p>Note that the <em>fieldpat</em> argument is not the delimiter, but a definition of the field structure itself. Here the <em>regexp</em> specifies a sequence of zero or more characters which are <em>not commas</em>.</p>
|
||||
<p>Note also that Bash variable <code>'x'</code> is set to a string, then this is edited to replace spaces by commas and fed to the <code>awk</code> script - which removes them again!</p>
|
||||
<p><b>2.</b> Another example using a more complex regular expression:</p>
|
||||
<pre><code>$ cat awk12_ex2.awk
|
||||
{
|
||||
patsplit($0,a,/([^,]*)|("[^"]+")/)
|
||||
for (i in a)
|
||||
printf "<%s> ",a[i]
|
||||
print ""
|
||||
}
|
||||
$ echo "A,\"red bird\",in,the,hand,is,worth,two,in,the,bush" | awk -f awk12_ex2.awk
|
||||
&lt;A&gt; &lt;"red bird"&gt; &lt;in&gt; &lt;the&gt; &lt;hand&gt; &lt;is&gt; &lt;worth&gt; &lt;two&gt; &lt;in&gt; &lt;the&gt; &lt;bush&gt;</code></pre>
|
||||
<p>This <em>regexp</em> handles data which is more like the standard CSV format:</p>
|
||||
<pre><code>([^,]*)|("[^"]+")</code></pre>
|
||||
<ul>
|
||||
<li>The first sub-expression deals with a series of zero or more <em>not commas</em>.</li>
|
||||
<li>The second one looks for a double-quoted string containing one or more <em>not double quote characters</em>. CSV conventions (e.g. RFC 4180) require fields containing commas, double quotes or line breaks to be quoted, and fields with embedded spaces are often quoted as well.</li>
|
||||
</ul>
|
||||
<p><b>3.</b> Showing what happens to the separators:</p>
|
||||
<pre><code>$ cat awk12_ex3.awk
|
||||
{
|
||||
flds = patsplit($0,a,/[A-Za-z]+/,s)
|
||||
for (i in a)
|
||||
printf "%s ",a[i]
|
||||
print ""
|
||||
for (i=1; i<=flds; i++)
|
||||
printf "%s ",s[i]
|
||||
print ""
|
||||
}
|
||||
$ echo "Grinning--------like----a-Cheshire--------cat---" | awk -f awk12_ex3.awk
|
||||
Grinning like a Cheshire cat
|
||||
-------- ---- - -------- ---</code></pre>
|
||||
<p>In this example the number of fields is stored in <code>flds</code>. The <em>regexp</em> used to define the fields is a sequence of one or more letters. These are printed in a loop as before.</p>
|
||||
<p>The separators are printed in a loop which counts from 1 to the number of fields, and these elements are shown. There is also an element zero because <code>patsplit</code> saves the separator which precedes the first field, but this is empty and we don’t print it here.</p>
|
||||
<h5 id="skip-unless-really-interested">Skip unless <em>really</em> interested</h5>
|
||||
<p><small> The data sent to this example was generated by an <code>awk</code> script which is shown below and is available in the downloadable file <a href="hpr2610_awk12_extra.awk">awk12_extra.awk</a>. Note that this one has been made into a standalone script by the addition of the <code>#!</code> line at the start (and has been made executable):</p>
|
||||
<pre><code>$ cat awk12_extra.awk
|
||||
#!/usr/bin/awk -f
|
||||
#
|
||||
# Awk script to take a sequence of words separated by spaces and turn them
|
||||
# into a string where each word is followed by as many hyphens as there are
|
||||
# letters in the word itself.
|
||||
#
|
||||
{
|
||||
for (i=1; i<=NF; i++){
|
||||
fill=$i
|
||||
gsub(/./,"-",fill)
|
||||
printf "%s%s",$i,fill
|
||||
}
|
||||
print ""
|
||||
}
|
||||
$ echo "Grinning like a Cheshire cat" | ./awk12_extra.awk
|
||||
Grinning--------like----a-Cheshire--------cat---</code></pre>
|
||||
<p></small></p>
|
||||
<h3 id="sorting-arrays">Sorting arrays</h3>
|
||||
<h4 id="using-procinfo">Using <code>PROCINFO</code></h4>
|
||||
<p>In standard <code>awk</code>, the order in which the elements of an array are returned is not defined and it’s necessary to go to some trouble to order them in a specific way.</p>
|
||||
<p>Gnu Awk (<code>gawk</code>) lets you control the order in which the array elements are returned by use of a special built-in array called <code>PROCINFO</code>.</p>
|
||||
<p>Setting <code>PROCINFO["sorted_in"]</code> to one of a set of predefined values allows array sorting. The values are:</p>
|
||||
<p><small></p>
|
||||
<table>
|
||||
<colgroup>
|
||||
<col style="width: 6%" />
|
||||
<col style="width: 93%" />
|
||||
</colgroup>
|
||||
<thead>
|
||||
<tr class="header">
|
||||
<th style="text-align: left;">Value</th>
|
||||
<th style="text-align: left;">Effect</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr class="odd">
|
||||
<td style="text-align: left;"><code>"@unsorted"</code></td>
|
||||
<td style="text-align: left;">Array elements are unsorted as in standard <code>awk</code></td>
|
||||
</tr>
|
||||
<tr class="even">
|
||||
<td style="text-align: left;"><code>"@ind_str_asc"</code></td>
|
||||
<td style="text-align: left;">Order by indices in ascending order compared as strings</td>
|
||||
</tr>
|
||||
<tr class="odd">
|
||||
<td style="text-align: left;"><code>"@ind_str_desc"</code></td>
|
||||
<td style="text-align: left;">Order by indices in descending order compared as strings</td>
|
||||
</tr>
|
||||
<tr class="even">
|
||||
<td style="text-align: left;"><code>"@ind_num_asc"</code></td>
|
||||
<td style="text-align: left;">Order by indices in ascending order forcing them to be treated as numbers</td>
|
||||
</tr>
|
||||
<tr class="odd">
|
||||
<td style="text-align: left;"><code>"@ind_num_desc"</code></td>
|
||||
<td style="text-align: left;">Order by indices in descending order forcing them to be treated as numbers</td>
|
||||
</tr>
|
||||
<tr class="even">
|
||||
<td style="text-align: left;"><code>"@val_type_asc"</code></td>
|
||||
<td style="text-align: left;">Order by element values in ascending order. Ordering is by the type assigned to the element</td>
|
||||
</tr>
|
||||
<tr class="odd">
|
||||
<td style="text-align: left;"><code>"@val_type_desc"</code></td>
|
||||
<td style="text-align: left;">Order by element values in descending order. Ordering is by the type assigned to the element</td>
|
||||
</tr>
|
||||
<tr class="even">
|
||||
<td style="text-align: left;"><code>"@val_str_asc"</code></td>
|
||||
<td style="text-align: left;">Order by element values in ascending order. Scalar values are compared as strings.</td>
|
||||
</tr>
|
||||
<tr class="odd">
|
||||
<td style="text-align: left;"><code>"@val_str_desc"</code></td>
|
||||
<td style="text-align: left;">Order by element values in descending order. Scalar values are compared as strings.</td>
|
||||
</tr>
|
||||
<tr class="even">
|
||||
<td style="text-align: left;"><code>"@val_num_asc"</code></td>
|
||||
<td style="text-align: left;">Order by element values in ascending order. Scalar values are compared as numbers.</td>
|
||||
</tr>
|
||||
<tr class="odd">
|
||||
<td style="text-align: left;"><code>"@val_num_desc"</code></td>
|
||||
<td style="text-align: left;">Order by element values in descending order. Scalar values are compared as numbers.</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<p></small></p>
|
||||
<p>Caveats:</p>
|
||||
<ul>
|
||||
<li>The sort order is determined before the loop begins and cannot be changed inside it.</li>
|
||||
<li>The value of <code>PROCINFO["sorted_in"]</code> is effective throughout the script and affects <strong>all</strong> array-scanning loops; it is not localised.</li>
|
||||
</ul>
|
||||
<p>This feature of GNU Awk is more complicated than has been described here. For example, arrays can be more complex than we have seen so far, and <code>PROCINFO["sorted_in"]</code> can also be used to call a user-defined function for sorting. The full details are available in the GNU Awk Manual, starting with <a href="https://www.gnu.org/software/gawk/manual/gawk.html#Controlling-Scanning" title="Using Predefined Array Scanning Orders with gawk">section 8.1.6</a>.</p>
|
||||
<h5 id="examples-1">Examples</h5>
|
||||
<p><b>1.</b> Sorting an array by its values:</p>
|
||||
<pre><code>$ cat awk12_ex4.awk
|
||||
BEGIN{
|
||||
PROCINFO["sorted_in"]="@val_str_asc"
|
||||
}
|
||||
{
|
||||
split($0,a," ")
|
||||
for (i in a)
|
||||
printf "%d: %s\n",i,a[i]
|
||||
}
|
||||
$ echo "An Englishman's home is his castle" | awk -f awk12_ex4.awk
|
||||
1: An
|
||||
2: Englishman's
|
||||
6: castle
|
||||
5: his
|
||||
3: home
|
||||
4: is</code></pre>
|
||||
<p>Here the array is populated using <em>split</em>. The setting of <code>PROCINFO["sorted_in"]</code> has requested sorting by element values in ascending order (in the <code>BEGIN</code> rule). The array is printed showing the indices and values and you can see that the order is as requested. Note that the words with capitals sort before the lowercase ones.</p>
|
||||
<p><b>Addendum:</b> I have included another example of the use of <code>PROCINFO</code> <a href="#extra-example">later in the notes</a>. Since the audio has already been recorded I have named the example awk12_ex10.awk to avoid changing other file names.</p>
|
||||
<h4 id="using-awks-array-sorting-functions">Using Awk’s Array Sorting Functions</h4>
|
||||
<p>As mentioned in episode 11, there are two functions for sorting arrays in GNU Awk: <code>asort</code> and <code>asorti</code>.</p>
|
||||
<dl>
|
||||
<dt><em>asort(<strong>source</strong> [, <strong>dest</strong> [, <strong>how</strong> ] ])</em></dt>
|
||||
<dd><p>Returns the number of elements in the array <em>source</em>.<br />
|
||||
Sorts the values of <em>source</em> and replaces the indices of the sorted values of <em>source</em> with sequential integers starting with one.<br />
|
||||
If the optional array <em>dest</em> is specified, then <em>source</em> is duplicated into <em>dest</em>. <em>dest</em> is then sorted, leaving the array <em>source</em> unchanged.<br />
|
||||
The third argument <em>how</em> specifies how the array is to be sorted.</p>
|
||||
</dd>
|
||||
<dt><em>asorti(<strong>source</strong> [, <strong>dest</strong> [, <strong>how</strong> ] ])</em></dt>
|
||||
<dd><p>Returns the number of elements in the array <em>source</em>.<br />
|
||||
Sorts the indices of <em>source</em> instead of the values.<br />
|
||||
If the optional array <em>dest</em> is specified, then the sorted indices of <em>source</em> are stored as the values of <em>dest</em> (indexed by sequential integers starting with one), leaving the array <em>source</em> unchanged.<br />
|
||||
The third argument <em>how</em> specifies how the array is to be sorted.</p>
|
||||
</dd>
|
||||
</dl>
|
||||
<p>In both cases the optional <em>how</em> argument defines the type of sorting. This must be one of the strings already defined: <code>"@ind_str_asc"</code> to <code>"@val_num_desc"</code>. It can also be, as mentioned above, the name of a user-defined function. We have not looked at user-defined functions yet, so we will leave this option for the moment.</p>
|
||||
<h5 id="examples-2">Examples</h5>
|
||||
<p><b>1.</b> Sorting an array with numeric indexes with <code>asort</code> reorders the indices:</p>
|
||||
<pre><code>$ cat awk12_ex5.awk
|
||||
BEGIN{
|
||||
a[1]="Jones"
|
||||
a[2]="X"
|
||||
a[3]="Smith"
|
||||
asort(a)
|
||||
for (i in a)
|
||||
printf "%s %s\n",i,a[i]
|
||||
}
|
||||
$ awk -f awk12_ex5.awk
|
||||
1 Jones
|
||||
2 Smith
|
||||
3 X</code></pre>
|
||||
<p>Note that the indices have been <u>destroyed</u> and replaced with 1, 2 and 3, in this case in a different order from their original values.</p>
|
||||
<p><b>2.</b> Sorting an array with character indices using <code>asort</code>, showing that providing a destination array is a way to avoid affecting the original:</p>
|
||||
<pre><code>$ cat awk12_ex6.awk
|
||||
BEGIN{
|
||||
a["a"]="Jones"
|
||||
a["b"]="X"
|
||||
a["c"]="Smith"
|
||||
asort(a,b)
|
||||
for (i in b)
|
||||
printf "b[%s] = %s\n",i,b[i]
|
||||
print ""
|
||||
for (i in a)
|
||||
printf "a[%s] = %s\n",i,a[i]
|
||||
}
|
||||
$ awk -f awk12_ex6.awk
|
||||
b[1] = Jones
|
||||
b[2] = Smith
|
||||
b[3] = X
|
||||
|
||||
a[a] = Jones
|
||||
a[b] = X
|
||||
a[c] = Smith</code></pre>
|
||||
<p>This again shows the sorted array <code>'b'</code> has had its indices replaced by the numbers 1, 2 and 3, so if these were important it might be a problem.</p>
|
||||
<p><b>3.</b> Sorting an array with string indices using <code>asorti</code> rebuilds the array with just the indexes, which is usually not useful on its own:</p>
|
||||
<pre><code>$ cat awk12_ex7.awk
|
||||
BEGIN{
|
||||
a["third"]="Jones"
|
||||
a["second"]="X"
|
||||
a["first"]="Smith"
|
||||
asorti(a)
|
||||
for (i in a)
|
||||
printf "%s %s\n",i,a[i]
|
||||
}
|
||||
$ awk -f awk12_ex7.awk
|
||||
1 first
|
||||
2 second
|
||||
3 third</code></pre>
|
||||
<p>In this case the contents of the array <code>'a'</code> have been destroyed, making the indices the contents and adding numeric indices.</p>
|
||||
<p><b>4.</b> Sorting an array with string indices using <code>asorti</code> but using the <em>dest</em> argument results in an array that can be used to access the original array in sorted order without changing it:</p>
|
||||
<pre><code>$ cat awk12_ex8.awk
|
||||
BEGIN{
|
||||
a["third"]="Jones"
|
||||
a["second"]="X"
|
||||
a["first"]="Smith"
|
||||
asorti(a,b)
|
||||
|
||||
print "What array a contains:"
|
||||
for (i in a)
|
||||
printf "a[%s] = %s\n",i,a[i]
|
||||
print ""
|
||||
|
||||
print "What array b contains:"
|
||||
for (i in b)
|
||||
printf "b[%s] = %s\n",i,b[i]
|
||||
print ""
|
||||
|
||||
print "Accessing original array a with sorted indices in b"
|
||||
for (i in b)
|
||||
printf "%6s: %s\n",b[i],a[b[i]]
|
||||
}
|
||||
$ awk -f awk12_ex8.awk
|
||||
What array a contains:
|
||||
a[first] = Smith
|
||||
a[third] = Jones
|
||||
a[second] = X
|
||||
|
||||
What array b contains:
|
||||
b[1] = first
|
||||
b[2] = second
|
||||
b[3] = third
|
||||
|
||||
Accessing original array a with sorted indices in b
|
||||
first: Smith
|
||||
second: X
|
||||
third: Jones</code></pre>
|
||||
<p><b>Note:</b> Since the audio explanation of this example was a bit vague I have enhanced the example to (hopefully) make it more understandable.</p>
|
||||
<p><b>5.</b> Sorting an array with character indices using <code>asort</code> but requesting a sort type <code>"@val_str_desc"</code> - descending order of element values:</p>
|
||||
<pre><code>$ cat awk12_ex9.awk
|
||||
BEGIN{
|
||||
a["a"]="Jones"
|
||||
a["b"]="X"
|
||||
a["c"]="Smith"
|
||||
asort(a,b,"@val_str_desc")
|
||||
for (i in b)
|
||||
printf "%s %s\n",i,b[i]
|
||||
}
|
||||
$ awk -f awk12_ex9.awk
|
||||
1 X
|
||||
2 Smith
|
||||
3 Jones</code></pre>
|
||||
<h4 id="extra-example">Extra example</h4>
|
||||
<p><b>1.</b> Another <code>PROCINFO</code> example which counts the initial letters of words in a dictionary:</p>
|
||||
<pre><code>$ cat awk12_ex10.awk
|
||||
#!/usr/bin/awk -f
|
||||
|
||||
#
|
||||
# Sort the indices as strings in ascending order
|
||||
#
|
||||
BEGIN{
|
||||
PROCINFO["sorted_in"]="@ind_str_asc"
|
||||
}
|
||||
|
||||
#
|
||||
# Make a frequency table of the first letter of each word
|
||||
#
|
||||
{
|
||||
freq[substr($1,1,1)]++
|
||||
}
|
||||
|
||||
#
|
||||
# Print the results in the frequency table
|
||||
#
|
||||
END{
|
||||
for (i in freq)
|
||||
printf "%s: %d\n",i,freq[i]
|
||||
}
|
||||
$ ./awk12_ex10.awk /usr/share/dict/words
|
||||
A: 1412
|
||||
B: 1462
|
||||
C: 1592
|
||||
D: 828
|
||||
E: 641
|
||||
F: 529
|
||||
G: 834
|
||||
H: 916
|
||||
I: 350
|
||||
J: 558
|
||||
K: 659
|
||||
...</code></pre>
|
||||
<p>In this example I have made the script executable and have added a <em>hash bang</em> line to define it as an Awk script. Don’t forget the <code>'-f'</code> at the end of that extra line.</p>
|
||||
<p>In this example the dictionary file <code>/usr/share/dict/words</code> is scanned. Each line contains a single word and the script takes the first letter of this word and uses it as an index to the array <code>freq</code>. This element is incremented by 1 resulting in the accumulation of the frequencies of these initial letters. The frequency table is printed in the <code>END</code> rule but because a sort order has been defined in the <code>BEGIN</code> rule the elements appear in ascending order of the index.</p>
|
||||
<h2 id="yet-more-about-arrays">Yet more about arrays</h2>
|
||||
<p>There is more to be said about arrays in Gnu Awk. It is possible to have multi-dimensional arrays (of a sort) and to have arrays as array elements too (a GNU extension).</p>
|
||||
<p>We probably will not be covering these further topics in this series, though there is plenty of information in the <a href="https://www.gnu.org/software/gawk/manual/html_node/index.html" title="GNU Awk User's Guide">GNU Awk manual</a> if you want to dig deeper.</p>
|
||||
<p>Of course, if we receive a request to cover this area in more depth then we will reconsider!</p>
|
||||
<hr />
|
||||
<h2 id="real-world-awk-example">Real-world Awk example</h2>
|
||||
<p>One of the things I do for HPR is to process the show notes sent in with episodes, many of which are plain text. Since we need HTML for loading into the HPR database I run these through an editor and a series of scripts to turn them into Markdown, and then generate HTML from them. I do this on my workstation after grabbing a copy of the notes from the HPR server.</p>
|
||||
<p>In order to check that the generated HTML looks OK I make a local copy of it, which can be viewed with a browser, and I use a tool called <code>pandoc</code> to make this version. This tool turns Markdown into HTML (amongst other document conversion tasks), but lately some of its requirements have changed necessitating a change to my workflow.</p>
|
||||
<p>To make the HTML copy I want for local viewing <code>pandoc</code> needs some additional information. The information takes the form of two delimited lines in YAML format, such as:</p>
|
||||
<div class="sourceCode" id="cb13"><pre class="sourceCode yaml"><code class="sourceCode yaml"><a class="sourceLine" id="cb13-1" data-line-number="1"><span class="ot">---</span></a>
|
||||
<a class="sourceLine" id="cb13-2" data-line-number="2"><span class="fu">title:</span><span class="at"> </span><span class="st">'Title of show'</span></a>
|
||||
<a class="sourceLine" id="cb13-3" data-line-number="3"><span class="fu">author:</span><span class="at"> </span><span class="st">'host name'</span></a>
|
||||
<a class="sourceLine" id="cb13-4" data-line-number="4"><span class="co">...</span></a></code></pre></div>
|
||||
<p>This metadata is used to generate headers in the final document.</p>
|
||||
<p>To generate this I added the following <code>awk</code> script to the Bash script I wrote that runs <code>pandoc</code>:</p>
|
||||
<div class="sourceCode" id="cb14"><pre class="sourceCode numberSource awk numberLines"><code class="sourceCode awk"><a class="sourceLine" id="cb14-1" data-line-number="1">awk <span class="op">-</span>f <span class="op">-</span> <span class="st">"$RAWFILE"</span> <span class="op">></span> <span class="st">"$TMP1"</span> <span class="op"><<</span>'ENDAWK'</a>
|
||||
<a class="sourceLine" id="cb14-2" data-line-number="2"><span class="cf">BEGIN</span> <span class="kw">{print</span> <span class="st">"---"</span><span class="kw">}</span></a>
|
||||
<a class="sourceLine" id="cb14-3" data-line-number="3"><span class="ot">/^</span><span class="ss">Title:</span><span class="ot">/</span> <span class="kw">{</span></a>
|
||||
<a class="sourceLine" id="cb14-4" data-line-number="4"> <span class="fu">sub</span>(<span class="op">/^</span>Title<span class="op">:</span>\s<span class="op">/,</span><span class="st">""</span>)</a>
|
||||
<a class="sourceLine" id="cb14-5" data-line-number="5"> <span class="fu">gsub</span>(<span class="op">/</span>'<span class="op">/,</span><span class="st">"''"</span>)</a>
|
||||
<a class="sourceLine" id="cb14-6" data-line-number="6"> <span class="kw">printf</span> <span class="st">"title: '%s'</span><span class="sc">\n</span><span class="st">"</span><span class="op">,</span><span class="dt">$0</span></a>
|
||||
<a class="sourceLine" id="cb14-7" data-line-number="7"><span class="kw">}</span></a>
|
||||
<a class="sourceLine" id="cb14-8" data-line-number="8"><span class="ot">/^</span><span class="ss">Host_Name:</span><span class="ot">/</span> <span class="kw">{</span></a>
|
||||
<a class="sourceLine" id="cb14-9" data-line-number="9"> <span class="fu">sub</span>(<span class="op">/^</span>Host_Name<span class="op">:</span>\s<span class="op">/,</span><span class="st">""</span>)</a>
|
||||
<a class="sourceLine" id="cb14-10" data-line-number="10"> <span class="fu">gsub</span>(<span class="op">/</span>'<span class="op">/,</span><span class="st">"''"</span>)</a>
|
||||
<a class="sourceLine" id="cb14-11" data-line-number="11"> <span class="kw">printf</span> <span class="st">"author: '%s'</span><span class="sc">\n</span><span class="st">"</span><span class="op">,</span><span class="dt">$0</span></a>
|
||||
<a class="sourceLine" id="cb14-12" data-line-number="12"><span class="kw">}</span></a>
|
||||
<a class="sourceLine" id="cb14-13" data-line-number="13"><span class="cf">END</span> <span class="kw">{print</span> <span class="st">"..."</span><span class="kw">}</span></a>
|
||||
<a class="sourceLine" id="cb14-14" data-line-number="14">ENDAWK</a></code></pre></div>
|
||||
<p>The first line is the invocation of <code>awk</code>. Note that the argument to the <code>-f</code> option is <code>'-'</code>, which means the standard input channel. This is catered for by the Bash <em>heredoc</em> which is everything from <code>"&lt;&lt;'ENDAWK'"</code> to the last line in the example. This is Bash’s way of embedding data in a script without having to put it in a string and risk all the issues that can ensue with string delimiters.</p>
|
||||
<p>The character string (<code>ENDAWK</code>) used in the <em>heredoc</em> to enclose the information to be offered to <code>awk</code> on standard input is chosen by the user, but it must be unique within the Bash script. Enclosing the first instance in single quotes turns off the Bash parameter substitution within the enclosed document - so <code>'$0'</code> in this example would have been seen and interpreted by Bash as a shell variable if this had not been done.</p>
|
||||
<p>The data file being processed by <code>awk</code> is a file containing the output of the show submission form, the name of which is in the <code>RAWFILE</code> variable. The output from <code>awk</code> is written to a temporary file, the name of which is in the variable <code>TMP1</code>.</p>
|
||||
<p>The <code>awk</code> script itself writes the necessary three hyphens in the <code>BEGIN</code> rule (line 2) and the final three full stops in the <code>END</code> rule (line 13).</p>
|
||||
<p>There are two regular expression matching rules. One matches <code>^Title:</code> which precedes the show title in the input file. The other matches <code>^Host_Name:</code> which labels the line containing the name of the host.</p>
|
||||
<p>In both cases these labels, together with the white space character that follows them (often a <em>Tab</em>), are deleted using the <code>sub</code> function (lines 4 and 9).</p>
|
||||
<p>Because the resulting strings might contain single quotes, a <code>gsub</code> call is used to double any that are found (lines 5 and 10).</p>
|
||||
<p>Finally the two strings are written out with the required labels for <code>pandoc</code>, using single quotes to enclose each of them (lines 6 and 11).</p>
|
||||
<p>The resulting file of YAML-format metadata is read by <code>pandoc</code> before the file of notes for the show.</p>
|
||||
<div class="sourceCode" id="cb15"><pre class="sourceCode bash"><code class="sourceCode bash"><a class="sourceLine" id="cb15-1" data-line-number="1"><span class="ex">pandoc</span> -f <span class="va">${FROM}</span>+smart -t html5 \</a>
|
||||
<a class="sourceLine" id="cb15-2" data-line-number="2"> --standalone --template=hpr.html5 \</a>
|
||||
<a class="sourceLine" id="cb15-3" data-line-number="3"> -c http://hackerpublicradio.org/css/hpr.css \</a>
|
||||
<a class="sourceLine" id="cb15-4" data-line-number="4"> <span class="st">"</span><span class="va">$TMP1</span><span class="st">"</span> <span class="st">"</span><span class="va">$EXTRACT</span><span class="st">"</span> -o <span class="st">"</span><span class="va">$FULLHTML</span><span class="st">"</span> <span class="va">$EXTRAS</span></a></code></pre></div>
|
||||
<p>Note that the viewable HTML file created here uses the HPR CSS so that it looks just as it will when the show is released.</p>
|
||||
<p>This is not a very complex Awk script, but I thought it might be of interest, especially given that a Bash <em>heredoc</em> is being used.</p>
|
||||
<h2 id="links">Links</h2>
|
||||
<ul>
|
||||
<li><a href="https://www.gnu.org/software/gawk/manual/html_node/index.html"><em>GNU Awk User’s Guide</em></a></li>
|
||||
<li>Previous shows in this series on HPR:
|
||||
<ul>
|
||||
<li><a href="http://hackerpublicradio.org/eps/hpr2114">“<em>Gnu Awk - Part 1</em>”</a> - episode 2114</li>
|
||||
<li><a href="http://hackerpublicradio.org/eps/hpr2129">“<em>Gnu Awk - Part 2</em>”</a> - episode 2129</li>
|
||||
<li><a href="http://hackerpublicradio.org/eps/hpr2143">“<em>Gnu Awk - Part 3</em>”</a> - episode 2143</li>
|
||||
<li><a href="http://hackerpublicradio.org/eps/hpr2163">“<em>Gnu Awk - Part 4</em>”</a> - episode 2163</li>
|
||||
<li><a href="http://hackerpublicradio.org/eps/hpr2184">“<em>Gnu Awk - Part 5</em>”</a> - episode 2184</li>
|
||||
<li><a href="http://hackerpublicradio.org/eps/hpr2238">“<em>Gnu Awk - Part 6</em>”</a> - episode 2238</li>
|
||||
<li><a href="http://hackerpublicradio.org/eps/hpr2330">“<em>Gnu Awk - Part 7</em>”</a> - episode 2330</li>
|
||||
<li><a href="http://hackerpublicradio.org/eps/hpr2438">“<em>Gnu Awk - Part 8</em>”</a> - episode 2438</li>
|
||||
<li><a href="http://hackerpublicradio.org/eps/hpr2476">“<em>Gnu Awk - Part 9</em>”</a> - episode 2476</li>
|
||||
<li><a href="http://hackerpublicradio.org/eps/hpr2526">“<em>Gnu Awk - Part 10</em>”</a> - episode 2526</li>
|
||||
<li><a href="http://hackerpublicradio.org/eps/hpr2554">“<em>Gnu Awk - Part 11</em>”</a> - episode 2554</li>
|
||||
</ul></li>
|
||||
<li>Resources:
|
||||
<ul>
|
||||
<li><a href="hpr2610_full_shownotes.epub">ePub version of these notes</a></li>
|
||||
<li>Examples: <a href="hpr2610_awk12_ex1.awk">awk12_ex1.awk</a>, <a href="hpr2610_awk12_ex2.awk">awk12_ex2.awk</a>, <a href="hpr2610_awk12_ex3.awk">awk12_ex3.awk</a>, <a href="hpr2610_awk12_ex4.awk">awk12_ex4.awk</a>, <a href="hpr2610_awk12_ex5.awk">awk12_ex5.awk</a>, <a href="hpr2610_awk12_ex6.awk">awk12_ex6.awk</a>, <a href="hpr2610_awk12_ex7.awk">awk12_ex7.awk</a>, <a href="hpr2610_awk12_ex8.awk">awk12_ex8.awk</a>, <a href="hpr2610_awk12_ex9.awk">awk12_ex9.awk</a>, <a href="hpr2610_awk12_ex10.awk">awk12_ex10.awk</a>, <a href="hpr2610_awk12_extra.awk">awk12_extra.awk</a></li>
|
||||
</ul></li>
|
||||
</ul>
|
||||
</article>
|
||||
</main>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
Reference in New Issue
Block a user