import {css} from '@emotion/react'

import ExternalLink from '../../components/ExternalLink'
import Section from '../../components/Section'
import {EMAILS} from '../../constants'
import sectorSrc from '../../assets/sector.svg'
import regionSrc from '../../assets/region.svg'

/**
 * Styles for the `<figure>` wrappers around the chart images: makes the figure a flex
 * container and centers its content along the main axis.
 */
const imageContainerCss = css`
  display: flex;
  justify-content: center;
`

/**
 * Styles for the chart `<img>` elements: pins the rendered height to 200px. No width is
 * set here.
 */
const imageCss = css`
  height: 200px;
`

/**
 * Renders the static copy for the SPGISpeech page as a fragment of `Section`s:
 * "SPGISpeech", "About Us", "The Dataset", "Example Snippets" and "Contact Us".
 *
 * The component takes no props. The only non-literal values it renders are the two
 * imported SVG sources (sector and region figures) and `EMAILS.INFO`, which is used both
 * as the `mailto:` target and as the visible link text in the contact section.
 *
 * @returns The fragment containing all sections.
 */
export default function LeftColumnContent(): JSX.Element {
  return (
    <>
      <Section title="SPGISpeech">
        <p>
          We are excited to present SPGISpeech (rhymes with “squeegee-speech”), a large-scale
          transcription dataset, freely available for academic research. SPGISpeech is a corpus of
          5,000 hours of professionally-transcribed financial audio. In contrast to previous
          transcription datasets, SPGISpeech contains a broad cross-section of L1 and L2 English
          accents, strongly varying audio quality, and both spontaneous and narrated speech. The
          transcripts have each been cross-checked by multiple professional editors for high
          accuracy and are fully formatted, including capitalization, punctuation, and
          denormalization of non-standard words. You can read more about SPGISpeech{' '}
          <ExternalLink to="https://arxiv.org/abs/2104.02014">here</ExternalLink>.
        </p>
        <p>
          We hope that SPGISpeech will help address the current data gap between industry and
          academic institutions, opening the door for new research efforts in large-scale and
          generalizable speech recognition.
        </p>
      </Section>
      <Section title="About Us">
        <p>
          S&P Global, a leading provider of credit ratings, analytics and data for capital and
          commodity markets worldwide, has a long history of providing vast amounts of structured
          data pertaining to financial markets - including high-quality transcripts of corporate
          earnings calls, management presentations and company acquisition calls.
        </p>
        <p>
          Over the course of the past decade, S&P Global’s team of analysts has manually transcribed
          over 100,000 hours of audio. The transcripts are +99% accurate and subject to strict
          service-level agreements with clients. The calls capture a variety of accents, recording
          qualities, industry sectors and discursive topics.
        </p>
        <figure css={imageContainerCss}>
          <img
            alt="Figure depicting the distribution of sectors within the dataset"
            src={sectorSrc}
            css={imageCss}
          />
        </figure>
        <p>
          Kensho serves as S&P Global’s innovation hub, and our machine learning team has leveraged
          this corpus to build Scribe - a state-of-the-art transcription solution designed to
          transcribe messy, real-world audio in the larger financial space.
        </p>
      </Section>
      <Section title="The Dataset">
        <p>
          SPGISpeech consists of 5,000 hours of recorded company earnings calls and their respective
          transcriptions. The original calls were split into slices ranging from 5 to 15 seconds in
          length to allow easy training for speech recognition systems. Calls represent a broad
          cross-section of international business English; SPGISpeech contains approximately 50,000
          speakers, one of the largest numbers of any speech corpus, and offers a variety of L1 and
          L2 English accents. The format of each WAV file is single channel, 16kHz, 16 bit audio.
        </p>
        <figure css={imageContainerCss}>
          <img
            alt="Figure depicting the distribution of regions within the dataset"
            src={regionSrc}
            css={imageCss}
          />
        </figure>
        <p>
          Transcription text represents the output of several stages of manual post-processing. As
          such, the text contains polished English orthography following a detailed style guide,
          including proper casing, punctuation, and denormalized non-standard words such as numbers
          and acronyms, making SPGISpeech suited for training fully formatted end-to-end models.
        </p>
      </Section>
      <Section title="Example Snippets">
        <p>Below, a few examples of transcript text from the corpus:</p>
        <ul>
          <li>“early in April versus what was going on at the beginning of the quarter?”</li>
          <li>“[...] for the first time in our 92-year history, we”</li>
          <li>“As respects our use of insurance to put out -- reinsurance to put out [...]”</li>
          <li>“in ’15, and got margins back to that kind of mid-teens level [...]”</li>
        </ul>
        {/* Wrapped in <p> so this commentary is a real paragraph like the rest of the copy. */}
        <p>
          These examples illustrate some of the challenges of performing fully-formatted automatic
          speech recognition. In the first example, the audio contains a pitch cue indicating a
          question fragment, though this is not evident from the verbatim text alone. In the second
          example, the spoken phrase “ninety two year” is transcribed with digits and hyphenation.
          The third example records self-correction, an especially common feature of spontaneous
          speech. The last example contains a spoken word “fifteen” that must be transcribed as
          “’15” (i.e. as an abbreviation of the year 2015) on the basis of its semantic role in
          the sentence. Together, these examples showcase not only the challenging nature of
          fully-formatted transcription, but also the potential advantage of transcribing
          orthography directly from audio rather than introducing it in post-processing.
        </p>
        <p>
          You can find more information about the corpus in our pre-print{' '}
          <ExternalLink to="https://arxiv.org/abs/2104.02014">here</ExternalLink>.
        </p>
      </Section>
      <Section title="Contact Us">
        <p>
          Please reach out to <a href={`mailto:${EMAILS.INFO}`}>{EMAILS.INFO}</a> with any questions
          or concerns.
        </p>
      </Section>
    </>
  )
}
