Move under www to ease rsync
This commit is contained in:
45
www/eps/hpr2238/hpr2238_contacts.awk
Executable file
45
www/eps/hpr2238/hpr2238_contacts.awk
Executable file
@@ -0,0 +1,45 @@
|
||||
#!/usr/bin/awk -f
|
||||
|
||||
#
|
||||
# Define separators
|
||||
#
|
||||
BEGIN{
|
||||
#
|
||||
# The field separator is a newline, so each line of a contact is one field
|
||||
#
|
||||
FS = "\n"
|
||||
|
||||
#
|
||||
# The record separator is two newlines since there's a blank line between
|
||||
# contacts, so each block of contact lines is read as a single record.
|
||||
# NOTE(review): a multi-character RS relies on gawk behaviour - confirm awk is gawk
|
||||
RS = "\n\n"
|
||||
|
||||
#
|
||||
# On output end every printed record with a newline, a line of four hyphens and a newline
|
||||
#
|
||||
ORS = "\n----\n"
|
||||
}
|
||||
|
||||
{
|
||||
# This rule has no pattern; each sub() below has no target so it edits $0
|
||||
# Show where the "beginning of buffer" is: \` matches the empty string there
|
||||
# so the "[" is inserted at the very start of the record
|
||||
sub(/\`/, "[")
|
||||
|
||||
#
|
||||
# Show where the "end of buffer" is: \' matches the empty string there
|
||||
# so the "]" is inserted at the very end of the record
|
||||
sub(/\'/, "]")
|
||||
|
||||
#
|
||||
# Show where the start and end of "line" are; ^ and $ anchor at the ends of
|
||||
# the whole record, so the braces land outside the brackets: {[ ... ]}
|
||||
sub(/^/, "{")
|
||||
|
||||
sub(/$/, "}")
|
||||
|
||||
#
|
||||
# Print the buffer with a record number and a field count, e.g. "(1/5) {[Name: ..."
|
||||
#
|
||||
print "(" NR "/" NF ")", $0
|
||||
}
|
||||
60
www/eps/hpr2238/hpr2238_contacts.txt
Executable file
60
www/eps/hpr2238/hpr2238_contacts.txt
Executable file
@@ -0,0 +1,60 @@
|
||||
Name: Robin Richardson
|
||||
First: Robin
|
||||
Last: Richardson
|
||||
Email: rrichardson0@163.com
|
||||
Gender: Female
|
||||
|
||||
Name: Anne Price
|
||||
First: Anne
|
||||
Last: Price
|
||||
Email: aprice1@cam.ac.uk
|
||||
Gender: Female
|
||||
|
||||
Name: Annie Warren
|
||||
First: Annie
|
||||
Last: Warren
|
||||
Email: awarren2@huffingtonpost.com
|
||||
Gender: Female
|
||||
|
||||
Name: Dorothy Turner
|
||||
First: Dorothy
|
||||
Last: Turner
|
||||
Email: dturner3@amazon.co.jp
|
||||
Gender: Female
|
||||
|
||||
Name: Barbara Gonzales
|
||||
First: Barbara
|
||||
Last: Gonzales
|
||||
Email: bgonzales4@diigo.com
|
||||
Gender: Female
|
||||
|
||||
Name: Shawn Spencer
|
||||
First: Shawn
|
||||
Last: Spencer
|
||||
Email: sspencer5@usda.gov
|
||||
Gender: Male
|
||||
|
||||
Name: Heather Anderson
|
||||
First: Heather
|
||||
Last: Anderson
|
||||
Email: handerson6@imgur.com
|
||||
Gender: Female
|
||||
|
||||
Name: Benjamin Wells
|
||||
First: Benjamin
|
||||
Last: Wells
|
||||
Email: bwells7@bbc.co.uk
|
||||
Gender: Male
|
||||
|
||||
Name: Elizabeth Little
|
||||
First: Elizabeth
|
||||
Last: Little
|
||||
Email: elittle8@prlog.org
|
||||
Gender: Female
|
||||
|
||||
Name: Joshua Snyder
|
||||
First: Joshua
|
||||
Last: Snyder
|
||||
Email: jsnyder9@dot.gov
|
||||
Gender: Male
|
||||
|
||||
BIN
www/eps/hpr2238/hpr2238_full_shownotes.epub
Executable file
BIN
www/eps/hpr2238/hpr2238_full_shownotes.epub
Executable file
Binary file not shown.
488
www/eps/hpr2238/hpr2238_full_shownotes.html
Executable file
488
www/eps/hpr2238/hpr2238_full_shownotes.html
Executable file
@@ -0,0 +1,488 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="generator" content="pandoc">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
|
||||
<meta name="author" content="Dave Morriss">
|
||||
<title>Gnu Awk - Part 6 (HPR Show 2238)</title>
|
||||
<style type="text/css">code{white-space: pre;}</style>
|
||||
<!--[if lt IE 9]>
|
||||
<script src="http://html5shim.googlecode.com/svn/trunk/html5.js"></script>
|
||||
<![endif]-->
|
||||
<style type="text/css">
|
||||
div.sourceCode { overflow-x: auto; }
|
||||
table.sourceCode, tr.sourceCode, td.lineNumbers, td.sourceCode {
|
||||
margin: 0; padding: 0; vertical-align: baseline; border: none; }
|
||||
table.sourceCode { width: 100%; line-height: 100%; }
|
||||
td.lineNumbers { text-align: right; padding-right: 4px; padding-left: 4px; color: #aaaaaa; border-right: 1px solid #aaaaaa; }
|
||||
td.sourceCode { padding-left: 5px; }
|
||||
code > span.kw { color: #007020; font-weight: bold; } /* Keyword */
|
||||
code > span.dt { color: #902000; } /* DataType */
|
||||
code > span.dv { color: #40a070; } /* DecVal */
|
||||
code > span.bn { color: #40a070; } /* BaseN */
|
||||
code > span.fl { color: #40a070; } /* Float */
|
||||
code > span.ch { color: #4070a0; } /* Char */
|
||||
code > span.st { color: #4070a0; } /* String */
|
||||
code > span.co { color: #60a0b0; font-style: italic; } /* Comment */
|
||||
code > span.ot { color: #007020; } /* Other */
|
||||
code > span.al { color: #ff0000; font-weight: bold; } /* Alert */
|
||||
code > span.fu { color: #06287e; } /* Function */
|
||||
code > span.er { color: #ff0000; font-weight: bold; } /* Error */
|
||||
code > span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
|
||||
code > span.cn { color: #880000; } /* Constant */
|
||||
code > span.sc { color: #4070a0; } /* SpecialChar */
|
||||
code > span.vs { color: #4070a0; } /* VerbatimString */
|
||||
code > span.ss { color: #bb6688; } /* SpecialString */
|
||||
code > span.im { } /* Import */
|
||||
code > span.va { color: #19177c; } /* Variable */
|
||||
code > span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
|
||||
code > span.op { color: #666666; } /* Operator */
|
||||
code > span.bu { } /* BuiltIn */
|
||||
code > span.ex { } /* Extension */
|
||||
code > span.pp { color: #bc7a00; } /* Preprocessor */
|
||||
code > span.at { color: #7d9029; } /* Attribute */
|
||||
code > span.do { color: #ba2121; font-style: italic; } /* Documentation */
|
||||
code > span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
|
||||
code > span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
|
||||
code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
|
||||
</style>
|
||||
<link rel="stylesheet" href="http://hackerpublicradio.org/css/hpr.css">
|
||||
</head>
|
||||
|
||||
<body id="home">
|
||||
<div id="container" class="shadow">
|
||||
<header>
|
||||
<h1 class="title">Gnu Awk - Part 6 (HPR Show 2238)</h1>
|
||||
<h2 class="author">Dave Morriss</h2>
|
||||
<hr/>
|
||||
</header>
|
||||
|
||||
<main id="maincontent">
|
||||
<article>
|
||||
<header>
|
||||
<h1>Table of Contents</h1>
|
||||
<nav id="TOC">
|
||||
<ul>
|
||||
<li><a href="#introduction">Introduction</a></li>
|
||||
<li><a href="#recap-of-the-last-episode">Recap of the last episode</a><ul>
|
||||
<li><a href="#regular-expressions">Regular expressions</a></li>
|
||||
<li><a href="#replacement">Replacement</a></li>
|
||||
</ul></li>
|
||||
<li><a href="#more-about-regular-expressions">More about regular expressions</a><ul>
|
||||
<li><a href="#more-regular-expression-operators">More regular expression operators</a></li>
|
||||
</ul></li>
|
||||
<li><a href="#functions">Functions</a><ul>
|
||||
<li><a href="#the-sub-function">The <code>sub</code> function</a><ul>
|
||||
<li><a href="#examples-using-sub">Examples using <code>sub</code></a></li>
|
||||
</ul></li>
|
||||
<li><a href="#the-gsub-function">The <code>gsub</code> function</a><ul>
|
||||
<li><a href="#examples-using-gsub">Examples using <code>gsub</code></a></li>
|
||||
</ul></li>
|
||||
<li><a href="#the-gensub-function">The <code>gensub</code> function</a><ul>
|
||||
<li><a href="#first-argument-regexp">First argument: <em>regexp</em></a></li>
|
||||
<li><a href="#second-argument-replacement">Second argument: <em>replacement</em></a></li>
|
||||
<li><a href="#third-argument-how">Third argument: <em>how</em></a></li>
|
||||
<li><a href="#fourth-argument-target">Fourth argument: <em>target</em></a></li>
|
||||
<li><a href="#examples-using-gensub">Examples using <code>gensub</code></a></li>
|
||||
</ul></li>
|
||||
</ul></li>
|
||||
<li><a href="#example-script">Example script</a></li>
|
||||
<li><a href="#warning-for-sed-users">Warning for <code>sed</code> users</a></li>
|
||||
<li><a href="#summary">Summary</a></li>
|
||||
<li><a href="#links">Links</a></li>
|
||||
</ul>
|
||||
</nav>
|
||||
</header>
|
||||
<h2 id="introduction">Introduction</h2>
|
||||
<p>This is the sixth episode of the “<a href="http://hackerpublicradio.org/series.php?id=94" title="Learning Awk">Learning Awk</a>” series that <a href="http://hackerpublicradio.org/correspondents.php?hostid=300" title="Mr. Young">Mr. Young</a> and I are doing.</p>
|
||||
<h2 id="recap-of-the-last-episode">Recap of the last episode</h2>
|
||||
<h3 id="regular-expressions">Regular expressions</h3>
|
||||
<p>In the last episode we saw regular expressions in the ‘<em>pattern</em>’ part of a ‘<em>pattern {action}</em>’ sequence. Such a sequence is called a ‘<em>RULE</em>’ (as we have seen in earlier episodes).</p>
|
||||
<pre><code>$1 ~ /p[elu]/ {print $0}</code></pre>
|
||||
<p><strong>Meaning</strong>: <em>If field 1 contains a ‘p’ followed by one of ‘e’, ‘l’ or ‘u’ print the whole line</em>.</p>
|
||||
<pre><code>$2 ~ /e{2}/ {print $0}</code></pre>
|
||||
<p><strong>Meaning</strong>: <em>If field 2 contains two instances of letter ‘e’ in sequence, print the whole line.</em></p>
|
||||
<p>It is usual to enclose the regular expression in slashes, which make it a <em>regexp constant</em> (see the <a href="https://www.gnu.org/software/gawk/manual/gawk.html#Regexp" title="GNU Awk User's Guide: Regular Expressions">GNU Manual</a> for the details of these constants).</p>
|
||||
<p>We had a look at many of the <em>operators</em> used in regular expressions in episode 5. Unfortunately, some small errors crept into the list of operators mentioned in that episode. These are incorrect:</p>
|
||||
<ul>
|
||||
<li><del>\A</del> (beginning of a string)</li>
|
||||
<li><del>\z</del> (end of a string)</li>
|
||||
<li><del>\b</del> (on a word boundary)</li>
|
||||
<li><del>\d</del> (any digit)</li>
|
||||
</ul>
|
||||
<p>The first two operators exist, as does the last one, but only in languages like Perl and Ruby, not in GNU Awk.</p>
|
||||
<p>For the ‘\b’ sequence the GNU manual says:</p>
|
||||
<blockquote>
|
||||
<p><em>In other GNU software, the word-boundary operator is ‘\b’. However, that conflicts with the awk language’s definition of ‘\b’ as backspace, so gawk uses a different letter. An alternative method would have been to require two backslashes in the GNU operators, but this was deemed too confusing. The current method of using ‘\y’ for the GNU ‘\b’ appears to be the lesser of two evils.</em></p>
|
||||
</blockquote>
|
||||
<p>The corrected list of operators is discussed later in this episode.</p>
|
||||
<h3 id="replacement">Replacement</h3>
|
||||
<p>Last episode we saw the built-in functions that use regular expressions for manipulating strings. These are <code>sub</code>, <code>gsub</code> and <code>gensub</code>. Regular expressions are used in other functions but we will look at them later.</p>
|
||||
<p>We will be looking at <code>sub</code>, <code>gsub</code> and <code>gensub</code> in more detail in this episode.</p>
|
||||
<h2 id="more-about-regular-expressions">More about regular expressions</h2>
|
||||
<h3 id="more-regular-expression-operators">More regular expression operators</h3>
|
||||
<p>We have seen that the regular expressions in GNU Awk use certain characters to denote concepts. For example, ‘.’ is not a full-stop (period) in a regular expression, but means <em>any character</em>. This special meaning can be turned off by preceding the character by a backslash ‘\’. Since a backslash is itself a <em>special</em> character, if you need an actual backslash in a regular expression then precede it with a backslash (‘\\’). We will demonstrate how the backslash might be used in the examples later.</p>
|
||||
<p>Note that (as with GNU sed) some regular expression operators consist of a backslash followed by a character.</p>
|
||||
<p>The following table summarises some of the regular expression operators, including some we have already encountered.</p>
|
||||
<table>
|
||||
<thead>
|
||||
<tr class="header">
|
||||
<th style="text-align: left;">Expression</th>
|
||||
<th>Meaning</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr class="odd">
|
||||
<td style="text-align: left;"><em>any character</em></td>
|
||||
<td>A single ordinary character matches itself</td>
|
||||
</tr>
|
||||
<tr class="even">
|
||||
<td style="text-align: left;"><strong>.</strong></td>
|
||||
<td>Matches any character</td>
|
||||
</tr>
|
||||
<tr class="odd">
|
||||
<td style="text-align: left;"><strong>*</strong></td>
|
||||
<td>Matches a sequence of zero or more instances of the preceding item</td>
|
||||
</tr>
|
||||
<tr class="even">
|
||||
<td style="text-align: left;"><strong>[<em>list</em>]</strong></td>
|
||||
<td>Matches any single character in <em>list</em>: for example, [aeiou] matches all vowels</td>
|
||||
</tr>
|
||||
<tr class="odd">
|
||||
<td style="text-align: left;"><strong>[^<em>list</em>]</strong></td>
|
||||
<td>A leading ‘^’ reverses the meaning of <em>list</em>, so that it matches any single character not in <em>list</em></td>
|
||||
</tr>
|
||||
<tr class="even">
|
||||
<td style="text-align: left;"><strong>^</strong></td>
|
||||
<td>Matches the beginning of the line (anchors the search at the start)</td>
|
||||
</tr>
|
||||
<tr class="odd">
|
||||
<td style="text-align: left;"><strong>$</strong></td>
|
||||
<td>Matches the end of the line (anchors the search at the end)</td>
|
||||
</tr>
|
||||
<tr class="even">
|
||||
<td style="text-align: left;"><strong>+</strong></td>
|
||||
<td>Similar to <strong>*</strong> but matches a sequence of one or more instances of the preceding item</td>
|
||||
</tr>
|
||||
<tr class="odd">
|
||||
<td style="text-align: left;"><strong>?</strong></td>
|
||||
<td>Similar to <strong>*</strong> but matches a sequence of zero or one instance of the preceding item</td>
|
||||
</tr>
|
||||
<tr class="even">
|
||||
<td style="text-align: left;"><strong>{i}</strong></td>
|
||||
<td>Matches exactly <code>i</code> sequences (<code>i</code> is a decimal integer)</td>
|
||||
</tr>
|
||||
<tr class="odd">
|
||||
<td style="text-align: left;"><strong>{i,j}</strong></td>
|
||||
<td>Matches between <code>i</code> and <code>j</code> sequences, inclusive</td>
|
||||
</tr>
|
||||
<tr class="even">
|
||||
<td style="text-align: left;"><strong>{i,}</strong></td>
|
||||
<td>Matches <code>i</code> or more sequences, inclusive</td>
|
||||
</tr>
|
||||
<tr class="odd">
|
||||
<td style="text-align: left;"><strong>(regexp)</strong></td>
|
||||
<td>Groups the inner <em>regexp</em>. Allows it to be followed by a postfix operator, or can be used for back references (see below)</td>
|
||||
</tr>
|
||||
<tr class="even">
|
||||
<td style="text-align: left;"><strong>regexp1|regexp2</strong></td>
|
||||
<td>Matches <em>regexp1</em> or <em>regexp2</em>, <strong>|</strong> is used to separate alternatives</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<!-- \** -->
|
||||
<p>The expressions ‘[<em>list</em>]’ and ‘[^<em>list</em>]’ are known as <em>bracket expressions</em> in GNU Awk. They represent a single character chosen from the <em>list</em>.</p>
|
||||
<p>To include the characters ‘\’, ‘]’, ‘-’ or ‘^’ in the list precede them with a backslash.</p>
|
||||
<p>The <em>character classes</em> like ‘[:alnum:]’ were dealt with in episode 5. These can only be used in <em>bracket expressions</em> and represent a single character. They are able to deal with extended character data (such as Unicode) whereas the older <em>list</em> syntax cannot.</p>
|
||||
<p>There are a number of GNU Awk (<em>gawk</em>) specific regular expression operators, some of which we touched on in the recap.</p>
|
||||
<dl>
|
||||
<dt><strong>\s</strong></dt>
|
||||
<dd>matches any whitespace character. Equivalent to the ‘[:space:]’ character class in a <em>bracket expression</em> (i.e. ‘[[:space:]]’).
|
||||
</dd>
|
||||
<dt><strong>\S</strong></dt>
|
||||
<dd>matches any character that is not whitespace. Equivalent to ‘[^[:space:]]’.
|
||||
</dd>
|
||||
<dt><strong>\w</strong></dt>
|
||||
<dd>matches any <em>word</em> character. A <em>word</em> character is any letter or digit or the underscore character.
|
||||
</dd>
|
||||
<dt><strong>\W</strong></dt>
|
||||
<dd>matches any <em>non-word</em> character.
|
||||
</dd>
|
||||
<dt><strong>\&lt;</strong></dt>
|
||||
<dd>(<em>backslash</em> <em>less than</em>) matches the empty string at the beginning of a word.
|
||||
</dd>
|
||||
<dt><strong>\></strong></dt>
|
||||
<dd>(<em>backslash</em> <em>greater than</em>) matches the empty string at the end of a word.
|
||||
</dd>
|
||||
<dt><strong>\y</strong></dt>
|
||||
<dd>(<em>backslash</em> <em>y</em>) matches a word boundary; that is it matches if the character to the left is a <em>word</em> character and the character to the right is a <em>non-word</em> character, or vice-versa.
|
||||
</dd>
|
||||
<dt><strong>\B</strong></dt>
|
||||
<dd>Matches everywhere but on a word boundary; that is it matches if the character to the left and the character to the right are either both <em>word</em> characters or both <em>non-word</em> characters. This is essentially the opposite of ‘\y’.
|
||||
</dd>
|
||||
<dt><strong>\`</strong></dt>
|
||||
<dd>(<em>backslash</em> <em>backquote</em>) matches the empty string at the beginning of a string. This is essentially the same as the ‘^’ (<em>circumflex</em> or <em>caret</em>) operator, which means the beginning of the current line ($0), or the start of a string.
|
||||
</dd>
|
||||
<dt><strong>\'</strong></dt>
|
||||
<dd>(<em>backslash</em> <em>single quote</em>) matches the empty string at the end of a string. This is essentially the same as the ‘$’ (<em>dollar sign</em>) operator, which means the end of the current line ($0), or the end of a string.
|
||||
</dd>
|
||||
</dl>
|
||||
<p>GNU Awk can behave as if it is traditional Awk, or will operate only in POSIX mode or can turn on and off other regular expression features. There is a discussion of this in the GNU Awk manual, particularly in the <a href="https://www.gnu.org/software/gawk/manual/gawk.html#Regexp" title="GNU Awk User's Guide: Regular Expressions">Regular Expression</a> section.</p>
|
||||
<h2 id="functions">Functions</h2>
|
||||
<p>The details of the built-in functions we will be looking at here can be found in the GNU Manual in the <a href="https://www.gnu.org/software/gawk/manual/gawk.html#String-Functions" title="GNU Awk User's Guide: String-Manipulation Functions"><em>String-Manipulation Functions</em></a> section.</p>
|
||||
<h3 id="the-sub-function">The <code>sub</code> function</h3>
|
||||
<p>The <code>sub</code> function has the format:</p>
|
||||
<pre><code>sub(regexp, replacement [, target])</code></pre>
|
||||
<p>The first argument <em>regexp</em> is a regular expression. This usually means it is enclosed in ‘//’ delimiters<a href="#fn1" class="footnoteRef" id="fnref1"><sup>1</sup></a>.</p>
|
||||
<p>The second argument <em>replacement</em> is a string to be used to replace the text matched by the <em>regexp</em>. If this contains a ‘&’ character this refers to the text that was matched.</p>
|
||||
<p>The optional third argument <em>target</em> is the name of the string or field that will be changed by the function. It has to be an existing string variable or field since <code>sub</code> changes it in place. If the <em>target</em> is omitted then field ‘$0’ (the whole input line) is modified.</p>
|
||||
<p>The purpose of the <code>sub</code> function is to search the string in the <em>target</em> variable for the longest leftmost match with the <em>regexp</em> argument. This is replaced by the <em>replacement</em> argument.</p>
|
||||
<p>The function returns the number of changes made (which can only be zero or 1).</p>
|
||||
<h4 id="examples-using-sub">Examples using <code>sub</code></h4>
|
||||
<pre><code>$ echo "banana" | awk '{sub(/an/,"XX"); print}'
|
||||
bXXana</code></pre>
|
||||
<p>The first occurrence of the string ‘an’ is matched in the ‘$0’ field, and replaced by ‘XX’.</p>
|
||||
<pre><code>$ echo "banana" | awk '{sub(/an/,"&&"); print}'
|
||||
bananana</code></pre>
|
||||
<p>This time the matched string is replaced by itself twice (‘anan’).</p>
|
||||
<pre><code>$ echo "banana" | awk '{n = sub(/an/,"&&"); print "Changes made=" n, "Result:", $0}'
|
||||
Changes made=1 Result: bananana</code></pre>
|
||||
<p>Here the result of the <code>sub</code> function is stored in the variable <code>n</code> and it and the result are printed.</p>
|
||||
<h3 id="the-gsub-function">The <code>gsub</code> function</h3>
|
||||
<p>The <code>gsub</code> function is similar to <code>sub</code> and has the format:</p>
|
||||
<pre><code>gsub(regexp, replacement [, target])</code></pre>
|
||||
<p>As with <code>sub</code>, the arguments have the same purpose.</p>
|
||||
<p>The function differs in that it searches <em>target</em> for <strong>all</strong> matches, and replaces them. The matches must not overlap (see below).</p>
|
||||
<p>The function returns the number of changes made.</p>
|
||||
<h4 id="examples-using-gsub">Examples using <code>gsub</code></h4>
|
||||
<pre><code>$ echo "banana" | awk '{gsub(/an/,"XX"); print}'
|
||||
bXXXXa</code></pre>
|
||||
<p>All occurrences of the string ‘an’ are matched in the ‘$0’ field, and replaced by ‘XX’.</p>
|
||||
<pre><code>$ echo "banana" | awk '{gsub(/ana/,"XX"); print}'
|
||||
bXXna</code></pre>
|
||||
<p>Here there are two <em>overlapping</em> instances of ‘ana’, but only the first is replaced.</p>
|
||||
<pre><code>$ awk '{n = gsub(/[aeiou]/,"?",$1); printf "%-12s (%d)\n",$1,n}' file1.txt
|
||||
n?m? (2)
|
||||
?ppl? (2)
|
||||
b?n?n? (3)
|
||||
str?wb?rry (2)
|
||||
gr?p? (2)
|
||||
?ppl? (2)
|
||||
pl?m (1)
|
||||
k?w? (2)
|
||||
p?t?t? (3)
|
||||
p?n??ppl? (4)</code></pre>
|
||||
<p>This time we used the example file <code>file1.txt</code> and replaced all vowels with question marks, then captured the number changed. We printed the result and the number of changes.</p>
|
||||
<h3 id="the-gensub-function">The <code>gensub</code> function</h3>
|
||||
<p>This function is different from the other two, and has been added to GNU Awk later than <code>sub</code> and <code>gsub</code><a href="#fn2" class="footnoteRef" id="fnref2"><sup>2</sup></a>:</p>
|
||||
<pre><code>gensub(regexp, replacement, how [, target])</code></pre>
|
||||
<h4 id="first-argument-regexp">First argument: <em>regexp</em></h4>
|
||||
<p>This is a regular expression (usually a <em>regexp constant</em> enclosed in slashes). Any of the regular expression operators seen in this and the last episode can be used. In particular, regular expressions enclosed in parentheses can be used here. (Similar features were described in the “<a href="http://hackerpublicradio.org/series.php?id=90" title="Learning sed">Learning sed</a>” series).</p>
|
||||
<h4 id="second-argument-replacement">Second argument: <em>replacement</em></h4>
|
||||
<p>In this argument, which is a string, the text to use for replacement is defined. This can also contain <em>back references</em> to text “captured” by the parenthesised expressions mentioned above.</p>
|
||||
<p>The back references consist of a backslash followed by a number. If the number is zero then it refers to the entire text matched by the regular expression and is equivalent to the ‘&’ character. Otherwise the number may be 1 to 9, referring to a parenthesised group.</p>
|
||||
<p>Because of the way Awk processes strings, it is necessary to double the backslash in this argument. For instance, to refer to parenthesised component number one the string must be “\\1”.</p>
|
||||
<h4 id="third-argument-how">Third argument: <em>how</em></h4>
|
||||
<p>This is a string which should contain ‘G’, ‘g’ or a number.</p>
|
||||
<p>If ‘G’ or ‘g’ (<em>global</em>) it means that all matches should be replaced as specified.</p>
|
||||
<p>If it is a number then it indicates which particular numbered match and replacement should be performed. It is not possible to perform multiple actions with this feature.</p>
|
||||
<h4 id="fourth-argument-target">Fourth argument: <em>target</em></h4>
|
||||
<p>If this optional argument is omitted then the field ‘$0’ is used. Otherwise the argument can be a string, a variable (containing a string) or a field.</p>
|
||||
<p>The <em>target</em> is <strong>not changed</strong> in situ, unlike with <code>sub</code> and <code>gsub</code>. The function returns the changed string instead.</p>
|
||||
<h4 id="examples-using-gensub">Examples using <code>gensub</code></h4>
|
||||
<pre><code>$ echo "banana" | awk '{print gensub(/a/,"A","g"); print}'
|
||||
bAnAnA
|
||||
banana</code></pre>
|
||||
<p>Here <code>gensub</code> matches every occurrence of ‘a’, replacing it with capital ‘A’ globally. Note how we print the result of the <code>gensub</code> call. Note also that ‘$0’ has not changed as can be seen when we print it with the second <code>print</code> statement.</p>
|
||||
<pre><code>$ echo "banana" | awk '{print gensub(/a/,"A","1")}'
|
||||
bAnana</code></pre>
|
||||
<p>In this example we have requested that only the first match be replaced. There is no way to replace anything other than all matches or just one using the <em>how</em> argument.</p>
|
||||
<pre><code>$ echo "banana" | awk '{print gensub(/\Ba\B/,"A","g")}'
|
||||
bAnAna</code></pre>
|
||||
<p>This example shows another way to replace matching letters. In this case we have specified only ‘a’s which are not at a word boundary. This is not an ideal solution.</p>
|
||||
<pre><code>$ echo "Hacker Public Radio" | awk '{print gensub(/(\w)(\w+)(\W*)/,"\\2\\1ay\\3","g")}'
|
||||
ackerHay ublicPay adioRay</code></pre>
|
||||
<p>This example shows the use of regular expression groups and back references. The three groups are:</p>
|
||||
<ol type="1">
|
||||
<li>A single “word” character</li>
|
||||
<li>One or more “word” characters</li>
|
||||
<li>Zero or more non-“word” characters</li>
|
||||
</ol>
|
||||
<p>Having matched these items (e.g. ‘H’, ‘acker’ and space for the first word), they are replaced by the second group (‘acker’), the first group (‘H’), the letters ‘ay’ and the third group (space). This is repeated throughout the <em>target</em>.</p>
|
||||
<p>Since the <em>target</em> text consists of three words the regular expression matches three times (since argument <em>how</em> was a ‘g’) and the words are all processed the same way - into primitive “Pig Latin”.</p>
|
||||
<pre><code>$ awk 'BEGIN{print gensub(/(\w)(\w+)(\W*)/,"\\2\\1ay\\3","3","Hacker Public Radio")}'
|
||||
Hacker Public adioRay</code></pre>
|
||||
<p>This example is a variant of the previous one. In this case the entire Awk script is in a ‘BEGIN’ rule, and the <em>target</em> is a string constant. Since argument <em>how</em> is the number 3 then only the third match is replaced.</p>
|
||||
<h2 id="example-script">Example script</h2>
|
||||
<p>I have included a longer example using a new test datafile. The example Awk script is called <code>contacts.awk</code> and the data file is <code>contacts.txt</code>. They are included with this show and links to them are listed below.</p>
|
||||
<p>The test data was generated on a site called “<a href="https://www.mockaroo.com/" title="Mockaroo">Mockaroo</a>”, which was used to generate CSV data. The Vim plugin <a href="https://github.com/chrisbra/csv.vim" title="csv.vim"><code>csv.vim</code></a> was used to reformat this into the final format with the <code>:ConvertData</code> function. Here are the first 8 lines from that file:</p>
|
||||
<pre><code>Name: Robin Richardson
|
||||
First: Robin
|
||||
Last: Richardson
|
||||
Email: rrichardson0@163.com
|
||||
Gender: Female
|
||||
|
||||
Name: Anne Price
|
||||
First: Anne</code></pre>
|
||||
<p>Here is the entire awk script which can be run thus:</p>
|
||||
<pre><code>awk -f contacts.awk contacts.txt</code></pre>
|
||||
<div class="sourceCode"><table class="sourceCode awk numberLines"><tr class="sourceCode"><td class="lineNumbers"><pre>1
|
||||
2
|
||||
3
|
||||
4
|
||||
5
|
||||
6
|
||||
7
|
||||
8
|
||||
9
|
||||
10
|
||||
11
|
||||
12
|
||||
13
|
||||
14
|
||||
15
|
||||
16
|
||||
17
|
||||
18
|
||||
19
|
||||
20
|
||||
21
|
||||
22
|
||||
23
|
||||
24
|
||||
25
|
||||
26
|
||||
27
|
||||
28
|
||||
29
|
||||
30
|
||||
31
|
||||
32
|
||||
33
|
||||
34
|
||||
35
|
||||
36
|
||||
37
|
||||
38
|
||||
39
|
||||
40
|
||||
41
|
||||
42
|
||||
43
|
||||
44
|
||||
45
|
||||
</pre></td><td class="sourceCode"><pre><code class="sourceCode awk"><span class="co">#!/usr/bin/awk -f</span>
|
||||
|
||||
<span class="co">#</span>
|
||||
<span class="co"># Define separators</span>
|
||||
<span class="co">#</span>
|
||||
<span class="cf">BEGIN</span><span class="kw">{</span>
|
||||
<span class="co">#</span>
|
||||
<span class="co"># The field separator is a newline</span>
|
||||
<span class="co">#</span>
|
||||
<span class="bu">FS</span> <span class="op">=</span> <span class="st">"</span><span class="sc">\n</span><span class="st">"</span>
|
||||
|
||||
<span class="co">#</span>
|
||||
<span class="co"># The record separator is two newlines since there's a blank line between</span>
|
||||
<span class="co"># contacts.</span>
|
||||
<span class="co">#</span>
|
||||
<span class="bu">RS</span> <span class="op">=</span> <span class="st">"</span><span class="sc">\n\n</span><span class="st">"</span>
|
||||
|
||||
<span class="co">#</span>
|
||||
<span class="co"># On output write a line of hyphens on a new line</span>
|
||||
<span class="co">#</span>
|
||||
<span class="bu">ORS</span> <span class="op">=</span> <span class="st">"</span><span class="sc">\n</span><span class="st">----</span><span class="sc">\n</span><span class="st">"</span>
|
||||
<span class="kw">}</span>
|
||||
|
||||
<span class="kw">{</span>
|
||||
<span class="co">#</span>
|
||||
<span class="co"># Show where the "beginning of buffer" is</span>
|
||||
<span class="co">#</span>
|
||||
<span class="fu">sub</span>(<span class="op">/</span>\`<span class="op">/,</span> <span class="st">"["</span>)
|
||||
|
||||
<span class="co">#</span>
|
||||
<span class="co"># Show where the "end of buffer" is</span>
|
||||
<span class="co">#</span>
|
||||
<span class="fu">sub</span>(<span class="op">/</span>\'<span class="op">/,</span> <span class="st">"]"</span>)
|
||||
|
||||
<span class="co">#</span>
|
||||
<span class="co"># Show where the start and end of "line" are</span>
|
||||
<span class="co">#</span>
|
||||
<span class="fu">sub</span>(<span class="op">/^/,</span> <span class="st">"{"</span>)
|
||||
<span class="fu">sub</span>(<span class="op">/</span>$<span class="op">/,</span> <span class="st">"}"</span>)
|
||||
|
||||
<span class="co">#</span>
|
||||
<span class="co"># Print the buffer with a record number and a field count</span>
|
||||
<span class="co">#</span>
|
||||
<span class="kw">print</span> <span class="st">"("</span> <span class="bu">NR</span> <span class="st">"/"</span> <span class="bu">NF</span> <span class="st">")"</span><span class="op">,</span> <span class="dt">$0</span>
|
||||
<span class="kw">}</span></code></pre></td></tr></table></div>
|
||||
<p>The script changes the default separators in order to treat the entire block of lines making up a contact as a single Awk “record”. Each field is separated from the next with a newline, and each “record” is separated from the next by two newlines. For variety, when printing, the output “records” are separated by a newline, four hyphens and a newline.</p>
|
||||
<p>As it processes each “record” the script marks the positions of four boundaries using some of the regular expression operators we have seen in this episode. It then prints the “record” ($0), preceded by the record number and the number of fields.</p>
|
||||
<p>A sample of the first 8 lines of the output looks like this:</p>
|
||||
<pre><code>(1/5) {[Name: Robin Richardson
|
||||
First: Robin
|
||||
Last: Richardson
|
||||
Email: rrichardson0@163.com
|
||||
Gender: Female]}
|
||||
----
|
||||
(2/5) {[Name: Anne Price
|
||||
First: Anne</code></pre>
|
||||
<h2 id="warning-for-sed-users">Warning for <code>sed</code> users</h2>
|
||||
<p>GNU awk is related to GNU sed, which was covered in the series “<a href="http://hackerpublicradio.org/series.php?id=90" title="Learning sed">Learning sed</a>”. If you listened to that series there is unfortunately some potential for confusion as we learn about GNU Awk. Many of the regular expression operators described for GNU sed are the same as those used in GNU Awk <strong>except</strong> that sed uses a backslash in front of some and Awk does not. Examples are ‘\+’ and ‘\?’ in sed versus ‘+’ and ‘?’ in Awk.</p>
|
||||
<h2 id="summary">Summary</h2>
|
||||
<p>This episode covered:</p>
|
||||
<ul>
|
||||
<li>A recap of the last episode
|
||||
<ul>
|
||||
<li>Correcting some small errors in the list of regular expression operators</li>
|
||||
</ul></li>
|
||||
<li>More detail of regular expression operators</li>
|
||||
<li>A detailed description of the functions <code>sub</code>, <code>gsub</code> and <code>gensub</code> with examples</li>
|
||||
<li>A more complex example Awk script</li>
|
||||
<li>A warning about the differences in regular expressions between sed and Awk</li>
|
||||
</ul>
|
||||
<h2 id="links">Links</h2>
|
||||
<ul>
|
||||
<li><a href="https://www.gnu.org/software/gawk/manual/html_node/index.html"><em>GNU Awk User’s Guide</em></a></li>
|
||||
<li>Previous shows in this series on HPR:
|
||||
<ul>
|
||||
<li><a href="http://hackerpublicradio.org/eps.php?id=2114">“<em>Gnu Awk - Part 1</em>”</a> - episode 2114</li>
|
||||
<li><a href="http://hackerpublicradio.org/eps.php?id=2129">“<em>Gnu Awk - Part 2</em>”</a> - episode 2129</li>
|
||||
<li><a href="http://hackerpublicradio.org/eps.php?id=2143">“<em>Gnu Awk - Part 3</em>”</a> - episode 2143</li>
|
||||
<li><a href="http://hackerpublicradio.org/eps.php?id=2163">“<em>Gnu Awk - Part 4</em>”</a> - episode 2163</li>
|
||||
<li><a href="http://hackerpublicradio.org/eps.php?id=2184">“<em>Gnu Awk - Part 5</em>”</a> - episode 2184</li>
|
||||
</ul></li>
|
||||
<li>The “<em>Learning sed</em>” series:
|
||||
<ul>
|
||||
<li><a href="http://hackerpublicradio.org/eps.php?id=1976">“Introduction to sed - part 1”</a> - episode 1976</li>
|
||||
<li><a href="http://hackerpublicradio.org/eps.php?id=1986">“Introduction to sed - part 2”</a> - episode 1986</li>
|
||||
<li><a href="http://hackerpublicradio.org/eps.php?id=1997">“Introduction to sed - part 3”</a> - episode 1997</li>
|
||||
<li><a href="http://hackerpublicradio.org/eps.php?id=2011">“Introduction to sed - part 4”</a> - episode 2011</li>
|
||||
<li><a href="http://hackerpublicradio.org/eps.php?id=2060">“Introduction to sed - part 5”</a> - episode 2060</li>
|
||||
</ul></li>
|
||||
<li>The “<a href="https://www.mockaroo.com/">Mockaroo</a>” data generator site</li>
|
||||
<li>The Vim plugin “<a href="https://github.com/chrisbra/csv.vim">csv.vim</a>”</li>
|
||||
<li>Resources:
|
||||
<ul>
|
||||
<li><a href="hpr2238_full_shownotes.epub">ePub version of these notes</a></li>
|
||||
<li><a href="hpr2238_full_shownotes.pdf">PDF version of these notes</a></li>
|
||||
<li>Demonstration of some regex operators: <a href="hpr2238_contacts.awk">contacts.awk</a></li>
|
||||
<li>File of dummy contacts: <a href="hpr2238_contacts.txt">contacts.txt</a></li>
|
||||
</ul></li>
|
||||
</ul>
|
||||
<!--
|
||||
vim: syntax=markdown:ts=8:sw=4:ai:et:tw=78:fo=tcqn:fdm=marker
|
||||
-->
|
||||
<section class="footnotes">
|
||||
<hr />
|
||||
<ol>
|
||||
<li id="fn1"><p>This is a “Regexp Constant”, but there is another form, the “Computed Regexp”, which is discussed in the <a href="https://www.gnu.org/software/gawk/manual/gawk.html#Regexp" title="GNU Awk User's Guide: Regular Expressions">GNU Manual</a>.<a href="#fnref1">↩</a></p></li>
|
||||
<li id="fn2"><p>As a possible point of interest, I have a copy of the “<em>GAWK Manual</em>” (as it was called), dated 1992, version 0.14, which does not contain <code>gensub</code>.<a href="#fnref2">↩</a></p></li>
|
||||
</ol>
|
||||
</section>
|
||||
</article>
|
||||
</main>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
BIN
www/eps/hpr2238/hpr2238_full_shownotes.pdf
Executable file
BIN
www/eps/hpr2238/hpr2238_full_shownotes.pdf
Executable file
Binary file not shown.
Reference in New Issue
Block a user